embed-tree 0.0.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- embed_tree-0.0.6/PKG-INFO +182 -0
- embed_tree-0.0.6/README.md +164 -0
- embed_tree-0.0.6/embed_tree/__init__.py +121 -0
- embed_tree-0.0.6/embed_tree/cache/__init__.py +10 -0
- embed_tree-0.0.6/embed_tree/cache/json.py +7 -0
- embed_tree-0.0.6/embed_tree/cache/model.py +13 -0
- embed_tree-0.0.6/embed_tree/cache/sqlalchemy.py +7 -0
- embed_tree-0.0.6/embed_tree/config.py +100 -0
- embed_tree-0.0.6/embed_tree/embedders/__init__.py +7 -0
- embed_tree-0.0.6/embed_tree/embedders/huggingface.py +69 -0
- embed_tree-0.0.6/embed_tree/embedders/model.py +27 -0
- embed_tree-0.0.6/embed_tree/labelers/__init__.py +8 -0
- embed_tree-0.0.6/embed_tree/labelers/function.py +25 -0
- embed_tree-0.0.6/embed_tree/labelers/llm.py +26 -0
- embed_tree-0.0.6/embed_tree/labelers/model.py +38 -0
- embed_tree-0.0.6/embed_tree/loaders/__init__.py +17 -0
- embed_tree-0.0.6/embed_tree/loaders/filesystem.py +83 -0
- embed_tree-0.0.6/embed_tree/loaders/json.py +49 -0
- embed_tree-0.0.6/embed_tree/loaders/model.py +20 -0
- embed_tree-0.0.6/embed_tree/loaders/sqlalchemy.py +91 -0
- embed_tree-0.0.6/embed_tree/loaders/sqlalchemy_content.py +63 -0
- embed_tree-0.0.6/embed_tree/loaders/sqlite.py +21 -0
- embed_tree-0.0.6/embed_tree/persisters/__init__.py +15 -0
- embed_tree-0.0.6/embed_tree/persisters/filesystem.py +293 -0
- embed_tree-0.0.6/embed_tree/persisters/json.py +29 -0
- embed_tree-0.0.6/embed_tree/persisters/model.py +23 -0
- embed_tree-0.0.6/embed_tree/persisters/sqlalchemy.py +76 -0
- embed_tree-0.0.6/embed_tree/projectors/__init__.py +7 -0
- embed_tree-0.0.6/embed_tree/projectors/model.py +39 -0
- embed_tree-0.0.6/embed_tree/projectors/pca.py +57 -0
- embed_tree-0.0.6/embed_tree/providers/__init__.py +20 -0
- embed_tree-0.0.6/embed_tree/providers/base.py +104 -0
- embed_tree-0.0.6/embed_tree/providers/fake.py +26 -0
- embed_tree-0.0.6/embed_tree/providers/local.py +44 -0
- embed_tree-0.0.6/embed_tree/providers/openai.py +49 -0
- embed_tree-0.0.6/embed_tree/reconcilers/__init__.py +6 -0
- embed_tree-0.0.6/embed_tree/reconcilers/default.py +65 -0
- embed_tree-0.0.6/embed_tree/reconcilers/model.py +25 -0
- embed_tree-0.0.6/embed_tree/reducers.py +194 -0
- embed_tree-0.0.6/embed_tree/representation/__init__.py +27 -0
- embed_tree-0.0.6/embed_tree/representation/default.py +59 -0
- embed_tree-0.0.6/embed_tree/representation/model.py +87 -0
- embed_tree-0.0.6/embed_tree/store.py +5 -0
- embed_tree-0.0.6/embed_tree/stores/__init__.py +8 -0
- embed_tree-0.0.6/embed_tree/stores/file.py +32 -0
- embed_tree-0.0.6/embed_tree/stores/model.py +25 -0
- embed_tree-0.0.6/embed_tree/stores/null.py +16 -0
- embed_tree-0.0.6/embed_tree/taggers.py +132 -0
- embed_tree-0.0.6/embed_tree/tree.py +691 -0
- embed_tree-0.0.6/embed_tree.egg-info/PKG-INFO +182 -0
- embed_tree-0.0.6/embed_tree.egg-info/SOURCES.txt +62 -0
- embed_tree-0.0.6/embed_tree.egg-info/dependency_links.txt +1 -0
- embed_tree-0.0.6/embed_tree.egg-info/requires.txt +15 -0
- embed_tree-0.0.6/embed_tree.egg-info/top_level.txt +1 -0
- embed_tree-0.0.6/pyproject.toml +36 -0
- embed_tree-0.0.6/setup.cfg +4 -0
- embed_tree-0.0.6/tests/test_abstractions.py +212 -0
- embed_tree-0.0.6/tests/test_batch.py +88 -0
- embed_tree-0.0.6/tests/test_models.py +90 -0
- embed_tree-0.0.6/tests/test_pca.py +132 -0
- embed_tree-0.0.6/tests/test_providers.py +102 -0
- embed_tree-0.0.6/tests/test_taggers.py +78 -0
- embed_tree-0.0.6/tests/test_taxonomy.py +151 -0
- embed_tree-0.0.6/tests/test_tree.py +160 -0
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: embed-tree
|
|
3
|
+
Version: 0.0.6
|
|
4
|
+
Summary: Incremental hierarchical clustering tree over content embeddings
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: numpy>=1.23
|
|
8
|
+
Requires-Dist: scikit-learn>=1.3
|
|
9
|
+
Requires-Dist: pydantic>=2
|
|
10
|
+
Provides-Extra: dev
|
|
11
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
12
|
+
Provides-Extra: openai
|
|
13
|
+
Requires-Dist: openai>=1; extra == "openai"
|
|
14
|
+
Provides-Extra: local
|
|
15
|
+
Requires-Dist: sentence-transformers>=2; extra == "local"
|
|
16
|
+
Provides-Extra: sql
|
|
17
|
+
Requires-Dist: sqlalchemy>=2; extra == "sql"
|
|
18
|
+
|
|
19
|
+
# embed-tree
|
|
20
|
+
|
|
21
|
+
`embed-tree` turns content embeddings into a browsable, labeled hierarchy.
|
|
22
|
+
It is useful when you have documents, notes, tickets, search results, or any
|
|
23
|
+
other text-like records and want a small taxonomy that a person can inspect.
|
|
24
|
+
|
|
25
|
+
The library stays model-agnostic: you provide an embedder, and `embed-tree`
|
|
26
|
+
handles clustering, labeling, querying, deletion, and persistence.
|
|
27
|
+
|
|
28
|
+
## Install
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install embed-tree
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Optional adapters:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install "embed-tree[openai]" # OpenAI embeddings
|
|
38
|
+
pip install "embed-tree[local]" # sentence-transformers embeddings
|
|
39
|
+
pip install "embed-tree[sql]" # SQLAlchemy loaders/persisters
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Quick Start
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from embed_tree import EmbedTree, FakeEmbeddingProvider, TreeConfig
|
|
46
|
+
|
|
47
|
+
embedder = FakeEmbeddingProvider(dim=32) # deterministic demo embedder
|
|
48
|
+
tree = EmbedTree(
|
|
49
|
+
embedder=embedder,
|
|
50
|
+
config=TreeConfig(max_branches=5, leaf_capacity=10),
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
tree.add_batch(
|
|
54
|
+
[
|
|
55
|
+
"Write import pipeline documentation",
|
|
56
|
+
"Fix login session refresh",
|
|
57
|
+
"Reduce report query latency",
|
|
58
|
+
"Add retry handling to data ingestion",
|
|
59
|
+
]
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
tree.organize() # rebuild a clean hierarchy and label every node
|
|
63
|
+
print(tree.show()) # human-readable outline
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Use a real embedding provider in production:
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from embed_tree import EmbedTree, OpenAIEmbeddingProvider
|
|
70
|
+
|
|
71
|
+
embedder = OpenAIEmbeddingProvider(
|
|
72
|
+
model="text-embedding-3-small",
|
|
73
|
+
api_key="...",
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
tree = EmbedTree(embedder=embedder)
|
|
77
|
+
tree.add("Some document text", payload={"source": "docs"})
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Core API
|
|
81
|
+
|
|
82
|
+
`EmbedTree` is the main entry point.
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
tree.add(content, item_id=None, payload=None, text=None)
|
|
86
|
+
tree.add_batch(contents, item_ids=None, payloads=None, texts=None)
|
|
87
|
+
tree.add_node(content_node)
|
|
88
|
+
tree.add_nodes(content_nodes)
|
|
89
|
+
tree.add_partial_tree(partial_tree)
|
|
90
|
+
|
|
91
|
+
tree.organize(tagger=None)
|
|
92
|
+
tree.rebalance()
|
|
93
|
+
tree.label(tagger=None)
|
|
94
|
+
|
|
95
|
+
tree.query(content, k=10, exhaustive=False)
|
|
96
|
+
tree.remove(item_id)
|
|
97
|
+
tree.remove_batch(item_ids)
|
|
98
|
+
|
|
99
|
+
tree.show(max_items=3)
|
|
100
|
+
tree.to_dict(max_items=5)
|
|
101
|
+
tree.get_tree()
|
|
102
|
+
len(tree)
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
`content` is what gets embedded. `text` is the human-readable string used in
|
|
106
|
+
labels and browse output; it defaults to `content` when `content` is a string.
|
|
107
|
+
`payload` is returned in query results and exported browse data.
|
|
108
|
+
|
|
109
|
+
## Configuration
|
|
110
|
+
|
|
111
|
+
Configuration is explicit and code-driven through `TreeConfig`. It does not
|
|
112
|
+
read environment variables.
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
from embed_tree import LLMConfig, RebalanceConfig, TreeConfig
|
|
116
|
+
|
|
117
|
+
config = TreeConfig(
|
|
118
|
+
max_branches=5,
|
|
119
|
+
leaf_capacity=10,
|
|
120
|
+
rebalance=RebalanceConfig(enabled=True, every_n_inserts=10_000),
|
|
121
|
+
llm=LLMConfig(provider="none"), # default keyword labels, no network
|
|
122
|
+
)
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
Defaults are tuned for readable taxonomies: small fan-out and small leaves.
|
|
126
|
+
Raise `max_branches` and `leaf_capacity` when using the tree primarily as a
|
|
127
|
+
retrieval index.
|
|
128
|
+
|
|
129
|
+
## Querying
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
hits = tree.query("related content", k=5)
|
|
133
|
+
# [(item_id, distance, payload), ...]
|
|
134
|
+
|
|
135
|
+
exact_hits = tree.query("related content", k=5, exhaustive=True)
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
Default queries route to one leaf and rank items there, which is fast but
|
|
139
|
+
approximate. `exhaustive=True` scans every item for exact nearest neighbors.
|
|
140
|
+
|
|
141
|
+
## Persistence
|
|
142
|
+
|
|
143
|
+
```python
|
|
144
|
+
from embed_tree import EmbedTree, FileTreeStore
|
|
145
|
+
|
|
146
|
+
tree = EmbedTree(
|
|
147
|
+
embedder=embedder,
|
|
148
|
+
store=FileTreeStore("./tree.json"),
|
|
149
|
+
)
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
`FileTreeStore` saves an atomic JSON snapshot after writes and reloads it when
|
|
153
|
+
the tree is constructed again.
|
|
154
|
+
|
|
155
|
+
## Labeling
|
|
156
|
+
|
|
157
|
+
Without extra configuration, node labels are generated locally from keywords.
|
|
158
|
+
For LLM labels:
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
from embed_tree import LLMConfig, TreeConfig
|
|
162
|
+
|
|
163
|
+
config = TreeConfig(
|
|
164
|
+
llm=LLMConfig(provider="openai", model="gpt-4o-mini", api_key="...")
|
|
165
|
+
)
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
You can also pass a custom `tagger: Callable[[list[str]], str]` to
|
|
169
|
+
`EmbedTree(..., tagger=...)`, `tree.label(tagger=...)`, or
|
|
170
|
+
`tree.organize(tagger=...)`.
|
|
171
|
+
|
|
172
|
+
## More Documentation
|
|
173
|
+
|
|
174
|
+
See [docs/API.md](docs/API.md) for the fuller API reference, provider details,
|
|
175
|
+
loader/persister abstractions, PCA options, and extension points.
|
|
176
|
+
|
|
177
|
+
## Development
|
|
178
|
+
|
|
179
|
+
```bash
|
|
180
|
+
uv sync --extra dev
|
|
181
|
+
uv run pytest -q
|
|
182
|
+
```
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
# embed-tree
|
|
2
|
+
|
|
3
|
+
`embed-tree` turns content embeddings into a browsable, labeled hierarchy.
|
|
4
|
+
It is useful when you have documents, notes, tickets, search results, or any
|
|
5
|
+
other text-like records and want a small taxonomy that a person can inspect.
|
|
6
|
+
|
|
7
|
+
The library stays model-agnostic: you provide an embedder, and `embed-tree`
|
|
8
|
+
handles clustering, labeling, querying, deletion, and persistence.
|
|
9
|
+
|
|
10
|
+
## Install
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install embed-tree
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
Optional adapters:
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install "embed-tree[openai]" # OpenAI embeddings
|
|
20
|
+
pip install "embed-tree[local]" # sentence-transformers embeddings
|
|
21
|
+
pip install "embed-tree[sql]" # SQLAlchemy loaders/persisters
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
from embed_tree import EmbedTree, FakeEmbeddingProvider, TreeConfig
|
|
28
|
+
|
|
29
|
+
embedder = FakeEmbeddingProvider(dim=32) # deterministic demo embedder
|
|
30
|
+
tree = EmbedTree(
|
|
31
|
+
embedder=embedder,
|
|
32
|
+
config=TreeConfig(max_branches=5, leaf_capacity=10),
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
tree.add_batch(
|
|
36
|
+
[
|
|
37
|
+
"Write import pipeline documentation",
|
|
38
|
+
"Fix login session refresh",
|
|
39
|
+
"Reduce report query latency",
|
|
40
|
+
"Add retry handling to data ingestion",
|
|
41
|
+
]
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
tree.organize() # rebuild a clean hierarchy and label every node
|
|
45
|
+
print(tree.show()) # human-readable outline
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Use a real embedding provider in production:
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
from embed_tree import EmbedTree, OpenAIEmbeddingProvider
|
|
52
|
+
|
|
53
|
+
embedder = OpenAIEmbeddingProvider(
|
|
54
|
+
model="text-embedding-3-small",
|
|
55
|
+
api_key="...",
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
tree = EmbedTree(embedder=embedder)
|
|
59
|
+
tree.add("Some document text", payload={"source": "docs"})
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Core API
|
|
63
|
+
|
|
64
|
+
`EmbedTree` is the main entry point.
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
tree.add(content, item_id=None, payload=None, text=None)
|
|
68
|
+
tree.add_batch(contents, item_ids=None, payloads=None, texts=None)
|
|
69
|
+
tree.add_node(content_node)
|
|
70
|
+
tree.add_nodes(content_nodes)
|
|
71
|
+
tree.add_partial_tree(partial_tree)
|
|
72
|
+
|
|
73
|
+
tree.organize(tagger=None)
|
|
74
|
+
tree.rebalance()
|
|
75
|
+
tree.label(tagger=None)
|
|
76
|
+
|
|
77
|
+
tree.query(content, k=10, exhaustive=False)
|
|
78
|
+
tree.remove(item_id)
|
|
79
|
+
tree.remove_batch(item_ids)
|
|
80
|
+
|
|
81
|
+
tree.show(max_items=3)
|
|
82
|
+
tree.to_dict(max_items=5)
|
|
83
|
+
tree.get_tree()
|
|
84
|
+
len(tree)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
`content` is what gets embedded. `text` is the human-readable string used in
|
|
88
|
+
labels and browse output; it defaults to `content` when `content` is a string.
|
|
89
|
+
`payload` is returned in query results and exported browse data.
|
|
90
|
+
|
|
91
|
+
## Configuration
|
|
92
|
+
|
|
93
|
+
Configuration is explicit and code-driven through `TreeConfig`. It does not
|
|
94
|
+
read environment variables.
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
from embed_tree import LLMConfig, RebalanceConfig, TreeConfig
|
|
98
|
+
|
|
99
|
+
config = TreeConfig(
|
|
100
|
+
max_branches=5,
|
|
101
|
+
leaf_capacity=10,
|
|
102
|
+
rebalance=RebalanceConfig(enabled=True, every_n_inserts=10_000),
|
|
103
|
+
llm=LLMConfig(provider="none"), # default keyword labels, no network
|
|
104
|
+
)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Defaults are tuned for readable taxonomies: small fan-out and small leaves.
|
|
108
|
+
Raise `max_branches` and `leaf_capacity` when using the tree primarily as a
|
|
109
|
+
retrieval index.
|
|
110
|
+
|
|
111
|
+
## Querying
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
hits = tree.query("related content", k=5)
|
|
115
|
+
# [(item_id, distance, payload), ...]
|
|
116
|
+
|
|
117
|
+
exact_hits = tree.query("related content", k=5, exhaustive=True)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Default queries route to one leaf and rank items there, which is fast but
|
|
121
|
+
approximate. `exhaustive=True` scans every item for exact nearest neighbors.
|
|
122
|
+
|
|
123
|
+
## Persistence
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
from embed_tree import EmbedTree, FileTreeStore
|
|
127
|
+
|
|
128
|
+
tree = EmbedTree(
|
|
129
|
+
embedder=embedder,
|
|
130
|
+
store=FileTreeStore("./tree.json"),
|
|
131
|
+
)
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
`FileTreeStore` saves an atomic JSON snapshot after writes and reloads it when
|
|
135
|
+
the tree is constructed again.
|
|
136
|
+
|
|
137
|
+
## Labeling
|
|
138
|
+
|
|
139
|
+
Without extra configuration, node labels are generated locally from keywords.
|
|
140
|
+
For LLM labels:
|
|
141
|
+
|
|
142
|
+
```python
|
|
143
|
+
from embed_tree import LLMConfig, TreeConfig
|
|
144
|
+
|
|
145
|
+
config = TreeConfig(
|
|
146
|
+
llm=LLMConfig(provider="openai", model="gpt-4o-mini", api_key="...")
|
|
147
|
+
)
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
You can also pass a custom `tagger: Callable[[list[str]], str]` to
|
|
151
|
+
`EmbedTree(..., tagger=...)`, `tree.label(tagger=...)`, or
|
|
152
|
+
`tree.organize(tagger=...)`.
|
|
153
|
+
|
|
154
|
+
## More Documentation
|
|
155
|
+
|
|
156
|
+
See [docs/API.md](docs/API.md) for the fuller API reference, provider details,
|
|
157
|
+
loader/persister abstractions, PCA options, and extension points.
|
|
158
|
+
|
|
159
|
+
## Development
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
uv sync --extra dev
|
|
163
|
+
uv run pytest -q
|
|
164
|
+
```
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""embed-tree: an incremental hierarchical clustering tree over embeddings.
|
|
2
|
+
|
|
3
|
+
See DESIGN.md for the full design. Minimal usage:
|
|
4
|
+
|
|
5
|
+
from embed_tree import EmbedTree, TreeConfig, FileTreeStore
|
|
6
|
+
|
|
7
|
+
tree = EmbedTree(embedder=my_embed_fn, store=FileTreeStore("./tree.json"))
|
|
8
|
+
tree.add("some content")
|
|
9
|
+
hits = tree.query("similar content", k=5)
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from .config import LLMConfig, RebalanceConfig, TreeConfig
|
|
13
|
+
from .embedders import HuggingFaceTextEmbedder, TextEmbedder, embed_texts
|
|
14
|
+
from .labelers import FunctionLabeler, LabelCandidate, Labeler, LabelRequest, LLMLabeler
|
|
15
|
+
from .loaders import (
|
|
16
|
+
FileSystemTreeLoader,
|
|
17
|
+
JsonTreeLoader,
|
|
18
|
+
SQLAlchemyContentLoader,
|
|
19
|
+
SQLAlchemyTreeLoader,
|
|
20
|
+
SQLiteTreeLoader,
|
|
21
|
+
TreeLoader,
|
|
22
|
+
)
|
|
23
|
+
from .persisters import (
|
|
24
|
+
FileSystemTreePersister,
|
|
25
|
+
FolderTreePersister,
|
|
26
|
+
JsonTreePersister,
|
|
27
|
+
MaterializedTreeState,
|
|
28
|
+
SQLAlchemyTreePersister,
|
|
29
|
+
TreePersister,
|
|
30
|
+
)
|
|
31
|
+
from .projectors import PCAConfig, PCAProjector, VectorProjector
|
|
32
|
+
from .providers import (
|
|
33
|
+
EmbeddingProvider,
|
|
34
|
+
FakeEmbeddingProvider,
|
|
35
|
+
OpenAIEmbeddingProvider,
|
|
36
|
+
SentenceTransformerProvider,
|
|
37
|
+
)
|
|
38
|
+
from .reducers import (
|
|
39
|
+
FreezePCAReducer,
|
|
40
|
+
IdentityReducer,
|
|
41
|
+
IncrementalPCAReducer,
|
|
42
|
+
Reducer,
|
|
43
|
+
)
|
|
44
|
+
from .reconcilers import DefaultTreeReconciler, TreeReconciler
|
|
45
|
+
from .representation import (
|
|
46
|
+
ContentNode,
|
|
47
|
+
DefaultTreeRepresentation,
|
|
48
|
+
KeyNode,
|
|
49
|
+
NodeAggregate,
|
|
50
|
+
NodeEmbedding,
|
|
51
|
+
NodeId,
|
|
52
|
+
PartialTree,
|
|
53
|
+
TreeEdge,
|
|
54
|
+
VectorData,
|
|
55
|
+
partial_tree_from_dict,
|
|
56
|
+
partial_tree_to_dict,
|
|
57
|
+
)
|
|
58
|
+
from .store import FileTreeStore, NullTreeStore, TreeState, TreeStore
|
|
59
|
+
from .taggers import KeywordTagger, LLMTagger, Tagger, make_tagger
|
|
60
|
+
from .tree import EmbedTree, Item, Node
|
|
61
|
+
|
|
62
|
+
# Public API of the package. Every name listed here is imported above; the
# comments only indicate which submodule each group comes from.
__all__ = [
    # .tree
    "EmbedTree",
    "Item",
    "Node",
    # .config
    "TreeConfig",
    "RebalanceConfig",
    "LLMConfig",
    # .embedders
    "TextEmbedder",
    "HuggingFaceTextEmbedder",
    "embed_texts",
    # .projectors
    "PCAConfig",
    "PCAProjector",
    "VectorProjector",
    # .labelers
    "LabelCandidate",
    "LabelRequest",
    "Labeler",
    "FunctionLabeler",
    "LLMLabeler",
    # .loaders
    "TreeLoader",
    "FileSystemTreeLoader",
    "JsonTreeLoader",
    "SQLAlchemyContentLoader",
    "SQLAlchemyTreeLoader",
    "SQLiteTreeLoader",
    # .persisters (state container)
    "MaterializedTreeState",
    # .reconcilers
    "TreeReconciler",
    "DefaultTreeReconciler",
    # .persisters
    "TreePersister",
    "FolderTreePersister",
    "FileSystemTreePersister",
    "JsonTreePersister",
    "SQLAlchemyTreePersister",
    # .representation
    "DefaultTreeRepresentation",
    "PartialTree",
    "ContentNode",
    "KeyNode",
    "TreeEdge",
    "NodeEmbedding",
    "NodeAggregate",
    "NodeId",
    "VectorData",
    "partial_tree_from_dict",
    "partial_tree_to_dict",
    # .store
    "TreeState",
    "TreeStore",
    "FileTreeStore",
    "NullTreeStore",
    # .providers
    "EmbeddingProvider",
    "FakeEmbeddingProvider",
    "OpenAIEmbeddingProvider",
    "SentenceTransformerProvider",
    # .reducers
    "Reducer",
    "IdentityReducer",
    "FreezePCAReducer",
    "IncrementalPCAReducer",
    # .taggers
    "Tagger",
    "KeywordTagger",
    "LLMTagger",
    "make_tagger",
]
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""Deprecated cache compatibility imports.
|
|
2
|
+
|
|
3
|
+
Use loaders plus persisters directly for new code.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from .json import JsonTreeCache
|
|
7
|
+
from .model import MaterializedTreeState, TreeCache
|
|
8
|
+
from .sqlalchemy import SQLAlchemyTreeCache
|
|
9
|
+
|
|
10
|
+
__all__ = ["MaterializedTreeState", "TreeCache", "JsonTreeCache", "SQLAlchemyTreeCache"]
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Deprecated cache compatibility contract."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Protocol, runtime_checkable
|
|
6
|
+
|
|
7
|
+
from embed_tree.loaders.model import TreeLoader
|
|
8
|
+
from embed_tree.persisters.model import MaterializedTreeState, TreePersister
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@runtime_checkable
class TreeCache(TreeLoader, TreePersister, Protocol):
    """Deprecated combined loader/persister protocol.

    Retained for backward compatibility only. It declares no members of its
    own; it simply inherits from both ``TreeLoader`` and ``TreePersister``.
    """
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""Configuration for embed-tree, as a single pydantic object.
|
|
2
|
+
|
|
3
|
+
`TreeConfig` is a plain pydantic `BaseModel` (NOT `BaseSettings`): it is
|
|
4
|
+
constructed explicitly and handed whole to `EmbedTree(config=...)`. It does
|
|
5
|
+
**not** read environment variables — every value must be passed in code, so the
|
|
6
|
+
configuration is always explicit and reproducible.
|
|
7
|
+
|
|
8
|
+
See DESIGN.md §4/§5.3. M1 adds PCA dimensionality reduction (pca_dims) in two
|
|
9
|
+
modes (freeze / incremental); see those sections for the rebalance contract.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from typing import Literal
|
|
15
|
+
|
|
16
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class RebalanceConfig(BaseModel):
    """Settings for rebuilding the whole tree from its leaves (DESIGN.md §4)."""

    enabled: bool = True
    # Auto-rebuild cadence; None disables it.
    every_n_inserts: int | None = 10_000
    # Allow a manual tree.rebalance() call.
    on_demand: bool = True
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class LLMConfig(BaseModel):
    """Settings for auto-naming taxonomy nodes (DESIGN.md §10).

    With provider "none" a no-network keyword tagger is used; "openai" and
    "local" generate labels with an LLM.
    """

    provider: Literal["none", "openai", "local"] = "none"
    # OpenAI model id, or HF model id when the provider is local.
    model: str = "gpt-4o-mini"
    # OpenAI key (explicit; never read from the environment).
    api_key: str | None = None
    # OpenAI-compatible endpoint (e.g. a local server).
    base_url: str | None = None
    # Number of member texts shown to the LLM when naming a cluster.
    max_samples: int = 15
    # Keeps labels short and browsable.
    max_label_words: int = 6
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class TreeConfig(BaseModel):
    """Top-level knobs. Defaults are tuned for the M0/M1 (<100k items) regime.

    Constructed in code only — no environment-variable loading.

    Validation rules enforced at construction time:

    * ``max_branches`` must be >= 2.
    * ``split_algo`` must be ``"kmeans"``; any other value raises
      ``NotImplementedError``.
    * ``leaf_capacity`` must be >= ``max_branches``.
    * When ``pca_dims`` is set: it must be >= 2, ``pca_warmup`` must be
      >= ``pca_dims``, and in ``"incremental"`` mode ``pca_batch_size`` must
      also be >= ``pca_dims``.
    """

    model_config = ConfigDict(protected_namespaces=())  # allow `model_args` name

    # Defaults are tuned for a human-browsable taxonomy (DESIGN.md §10): a small
    # fan-out and small leaves keep every level readable (<=5 sub-topics, <=10
    # items per leaf). Raise both for a large-scale retrieval index instead.
    max_branches: int = 5  # max sub-topics per level (k for KMeans)
    leaf_capacity: int = 10  # max items in a leaf before it subdivides
    split_algo: str = "kmeans"  # M0: "kmeans" only

    # Distance is always cosine: vectors are L2-normalized and compared with
    # Euclidean (rank-equivalent on the unit sphere). Embeddings encode meaning
    # in direction, not magnitude, so there is no separate distance knob.

    # --- PCA dimensionality reduction (M1; see DESIGN.md §5.3) -------------
    # Off by default: only worth it at scale (thousands+). At tens of items PCA
    # is meaningless (too few samples) and never reaches pca_warmup anyway.
    pca_dims: int | None = None  # None = no reduction (operate in raw space)
    pca_mode: Literal["freeze", "incremental"] = "freeze"
    pca_warmup: int = 1000  # items buffered before the first PCA fit
    pca_batch_size: int = 256  # incremental: partial_fit cadence

    rebalance: RebalanceConfig = Field(default_factory=RebalanceConfig)
    llm: LLMConfig = Field(default_factory=LLMConfig)  # node auto-naming
    model_args: dict = Field(default_factory=dict)  # passed through to KMeans

    @field_validator("max_branches")
    @classmethod
    def _min_branches(cls, v: int) -> int:
        """Reject a fan-out smaller than two branches."""
        if v < 2:
            raise ValueError("max_branches must be >= 2")
        return v

    @field_validator("split_algo")
    @classmethod
    def _supported_split(cls, v: str) -> str:
        """Accept only the "kmeans" split algorithm.

        Raises NotImplementedError (not ValueError) for anything else, which
        is the exception type existing callers see.
        """
        if v != "kmeans":
            raise NotImplementedError(
                f"split_algo={v!r} arrives in a later milestone; "
                "M0 supports 'kmeans' only"
            )
        return v

    @model_validator(mode="after")
    def _cross_field(self) -> "TreeConfig":
        """Check constraints that span several fields; return ``self``."""
        if self.leaf_capacity < self.max_branches:
            raise ValueError("leaf_capacity must be >= max_branches")
        if self.pca_dims is None:
            # No PCA requested: the remaining PCA knobs are irrelevant.
            return self
        if self.pca_dims < 2:
            raise ValueError("pca_dims must be >= 2")
        # PCA needs at least n_components samples to fit.
        if self.pca_warmup < self.pca_dims:
            raise ValueError("pca_warmup must be >= pca_dims")
        # pca_batch_size is documented above as the incremental partial_fit
        # cadence, so it is only validated in incremental mode. Checking it
        # in freeze mode made the default of 256 reject any pca_dims > 256.
        # NOTE(review): assumes the freeze-mode reducer never reads
        # pca_batch_size — confirm against reducers.py.
        if self.pca_mode == "incremental" and self.pca_batch_size < self.pca_dims:
            raise ValueError("pca_batch_size must be >= pca_dims")
        return self
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Open-source local embeddings via Hugging Face sentence-transformers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
from embed_tree.providers.local import SentenceTransformerProvider
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class HuggingFaceTextEmbedder(SentenceTransformerProvider):
    """Sentence-transformers embedder with Mac-friendly device selection.

    sentence-transformers downloads the model from Hugging Face on first use
    and caches it afterwards. With device="auto" on Apple Silicon, MPS is
    preferred when PyTorch reports it as available; otherwise the embedder
    falls back to CPU.
    """

    def __init__(
        self,
        model: str = "BAAI/bge-small-en-v1.5",
        *,
        device: str | None = "auto",
        cache_folder: str | Path | None = None,
        model_obj: Any | None = None,
        encode_kwargs: dict[str, Any] | None = None,
        **kwargs: Any,
    ) -> None:
        """Resolve the device, build the model if needed, then init the base.

        Args:
            model: Model id handed to ``SentenceTransformer`` and to the base
                class.
            device: Device name, or "auto" to let ``_resolve_device`` choose.
            cache_folder: Optional cache directory; stored as a string.
            model_obj: Pre-built model object. When given, sentence-transformers
                is not imported and no model is constructed here.
            encode_kwargs: Forwarded unchanged to the base class.
            **kwargs: Forwarded unchanged to the base class.

        Raises:
            ImportError: If ``model_obj`` is None and sentence-transformers is
                not installed.
        """
        target_device = _resolve_device(device)
        self.model_name = model
        self.device = target_device
        self.cache_folder = str(cache_folder) if cache_folder is not None else None

        if model_obj is None:
            # Imported lazily so the dependency is only needed when this
            # class has to build the model itself.
            try:
                from sentence_transformers import SentenceTransformer
            except ImportError as exc:  # pragma: no cover
                raise ImportError(
                    'HuggingFaceTextEmbedder needs the "local" extra: '
                    'pip install "embed-tree[local]"'
                ) from exc
            model_obj = SentenceTransformer(
                model,
                device=target_device,
                cache_folder=self.cache_folder,
            )

        # Both paths hand the same arguments to the base class.
        super().__init__(
            model=model,
            device=target_device,
            model_obj=model_obj,
            encode_kwargs=encode_kwargs,
            **kwargs,
        )
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _resolve_device(device: str | None) -> str | None:
|
|
50
|
+
if device != "auto":
|
|
51
|
+
return device
|
|
52
|
+
try:
|
|
53
|
+
import torch
|
|
54
|
+
except ImportError:
|
|
55
|
+
return None
|
|
56
|
+
if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
|
|
57
|
+
return "mps"
|
|
58
|
+
if torch.cuda.is_available():
|
|
59
|
+
return "cuda"
|
|
60
|
+
return "cpu"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def embed_texts(embedder: Any, texts: list[str]) -> np.ndarray:
    """Embed ``texts`` and return the result as a NumPy array.

    When ``embedder`` has a callable ``embed_batch`` attribute, it is called
    once with the whole list. Otherwise ``embedder`` itself is called on each
    text in order and the per-text results are stacked.
    """
    batch_fn = getattr(embedder, "embed_batch", None)
    if not callable(batch_fn):
        vectors = [embedder(text) for text in texts]
    else:
        vectors = batch_fn(texts)
    return np.asarray(vectors)
|
|
69
|
+
|