embed-tree 0.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. embed_tree-0.0.6/PKG-INFO +182 -0
  2. embed_tree-0.0.6/README.md +164 -0
  3. embed_tree-0.0.6/embed_tree/__init__.py +121 -0
  4. embed_tree-0.0.6/embed_tree/cache/__init__.py +10 -0
  5. embed_tree-0.0.6/embed_tree/cache/json.py +7 -0
  6. embed_tree-0.0.6/embed_tree/cache/model.py +13 -0
  7. embed_tree-0.0.6/embed_tree/cache/sqlalchemy.py +7 -0
  8. embed_tree-0.0.6/embed_tree/config.py +100 -0
  9. embed_tree-0.0.6/embed_tree/embedders/__init__.py +7 -0
  10. embed_tree-0.0.6/embed_tree/embedders/huggingface.py +69 -0
  11. embed_tree-0.0.6/embed_tree/embedders/model.py +27 -0
  12. embed_tree-0.0.6/embed_tree/labelers/__init__.py +8 -0
  13. embed_tree-0.0.6/embed_tree/labelers/function.py +25 -0
  14. embed_tree-0.0.6/embed_tree/labelers/llm.py +26 -0
  15. embed_tree-0.0.6/embed_tree/labelers/model.py +38 -0
  16. embed_tree-0.0.6/embed_tree/loaders/__init__.py +17 -0
  17. embed_tree-0.0.6/embed_tree/loaders/filesystem.py +83 -0
  18. embed_tree-0.0.6/embed_tree/loaders/json.py +49 -0
  19. embed_tree-0.0.6/embed_tree/loaders/model.py +20 -0
  20. embed_tree-0.0.6/embed_tree/loaders/sqlalchemy.py +91 -0
  21. embed_tree-0.0.6/embed_tree/loaders/sqlalchemy_content.py +63 -0
  22. embed_tree-0.0.6/embed_tree/loaders/sqlite.py +21 -0
  23. embed_tree-0.0.6/embed_tree/persisters/__init__.py +15 -0
  24. embed_tree-0.0.6/embed_tree/persisters/filesystem.py +293 -0
  25. embed_tree-0.0.6/embed_tree/persisters/json.py +29 -0
  26. embed_tree-0.0.6/embed_tree/persisters/model.py +23 -0
  27. embed_tree-0.0.6/embed_tree/persisters/sqlalchemy.py +76 -0
  28. embed_tree-0.0.6/embed_tree/projectors/__init__.py +7 -0
  29. embed_tree-0.0.6/embed_tree/projectors/model.py +39 -0
  30. embed_tree-0.0.6/embed_tree/projectors/pca.py +57 -0
  31. embed_tree-0.0.6/embed_tree/providers/__init__.py +20 -0
  32. embed_tree-0.0.6/embed_tree/providers/base.py +104 -0
  33. embed_tree-0.0.6/embed_tree/providers/fake.py +26 -0
  34. embed_tree-0.0.6/embed_tree/providers/local.py +44 -0
  35. embed_tree-0.0.6/embed_tree/providers/openai.py +49 -0
  36. embed_tree-0.0.6/embed_tree/reconcilers/__init__.py +6 -0
  37. embed_tree-0.0.6/embed_tree/reconcilers/default.py +65 -0
  38. embed_tree-0.0.6/embed_tree/reconcilers/model.py +25 -0
  39. embed_tree-0.0.6/embed_tree/reducers.py +194 -0
  40. embed_tree-0.0.6/embed_tree/representation/__init__.py +27 -0
  41. embed_tree-0.0.6/embed_tree/representation/default.py +59 -0
  42. embed_tree-0.0.6/embed_tree/representation/model.py +87 -0
  43. embed_tree-0.0.6/embed_tree/store.py +5 -0
  44. embed_tree-0.0.6/embed_tree/stores/__init__.py +8 -0
  45. embed_tree-0.0.6/embed_tree/stores/file.py +32 -0
  46. embed_tree-0.0.6/embed_tree/stores/model.py +25 -0
  47. embed_tree-0.0.6/embed_tree/stores/null.py +16 -0
  48. embed_tree-0.0.6/embed_tree/taggers.py +132 -0
  49. embed_tree-0.0.6/embed_tree/tree.py +691 -0
  50. embed_tree-0.0.6/embed_tree.egg-info/PKG-INFO +182 -0
  51. embed_tree-0.0.6/embed_tree.egg-info/SOURCES.txt +62 -0
  52. embed_tree-0.0.6/embed_tree.egg-info/dependency_links.txt +1 -0
  53. embed_tree-0.0.6/embed_tree.egg-info/requires.txt +15 -0
  54. embed_tree-0.0.6/embed_tree.egg-info/top_level.txt +1 -0
  55. embed_tree-0.0.6/pyproject.toml +36 -0
  56. embed_tree-0.0.6/setup.cfg +4 -0
  57. embed_tree-0.0.6/tests/test_abstractions.py +212 -0
  58. embed_tree-0.0.6/tests/test_batch.py +88 -0
  59. embed_tree-0.0.6/tests/test_models.py +90 -0
  60. embed_tree-0.0.6/tests/test_pca.py +132 -0
  61. embed_tree-0.0.6/tests/test_providers.py +102 -0
  62. embed_tree-0.0.6/tests/test_taggers.py +78 -0
  63. embed_tree-0.0.6/tests/test_taxonomy.py +151 -0
  64. embed_tree-0.0.6/tests/test_tree.py +160 -0
@@ -0,0 +1,182 @@
1
+ Metadata-Version: 2.4
2
+ Name: embed-tree
3
+ Version: 0.0.6
4
+ Summary: Incremental hierarchical clustering tree over content embeddings
5
+ Requires-Python: >=3.10
6
+ Description-Content-Type: text/markdown
7
+ Requires-Dist: numpy>=1.23
8
+ Requires-Dist: scikit-learn>=1.3
9
+ Requires-Dist: pydantic>=2
10
+ Provides-Extra: dev
11
+ Requires-Dist: pytest>=7; extra == "dev"
12
+ Provides-Extra: openai
13
+ Requires-Dist: openai>=1; extra == "openai"
14
+ Provides-Extra: local
15
+ Requires-Dist: sentence-transformers>=2; extra == "local"
16
+ Provides-Extra: sql
17
+ Requires-Dist: sqlalchemy>=2; extra == "sql"
18
+
19
+ # embed-tree
20
+
21
+ `embed-tree` turns content embeddings into a browsable, labeled hierarchy.
22
+ It is useful when you have documents, notes, tickets, search results, or any
23
+ other text-like records and want a small taxonomy that a person can inspect.
24
+
25
+ The library stays model-agnostic: you provide an embedder, and `embed-tree`
26
+ handles clustering, labeling, querying, deletion, and persistence.
27
+
28
+ ## Install
29
+
30
+ ```bash
31
+ pip install embed-tree
32
+ ```
33
+
34
+ Optional adapters:
35
+
36
+ ```bash
37
+ pip install "embed-tree[openai]" # OpenAI embeddings
38
+ pip install "embed-tree[local]" # sentence-transformers embeddings
39
+ pip install "embed-tree[sql]" # SQLAlchemy loaders/persisters
40
+ ```
41
+
42
+ ## Quick Start
43
+
44
+ ```python
45
+ from embed_tree import EmbedTree, FakeEmbeddingProvider, TreeConfig
46
+
47
+ embedder = FakeEmbeddingProvider(dim=32) # deterministic demo embedder
48
+ tree = EmbedTree(
49
+ embedder=embedder,
50
+ config=TreeConfig(max_branches=5, leaf_capacity=10),
51
+ )
52
+
53
+ tree.add_batch(
54
+ [
55
+ "Write import pipeline documentation",
56
+ "Fix login session refresh",
57
+ "Reduce report query latency",
58
+ "Add retry handling to data ingestion",
59
+ ]
60
+ )
61
+
62
+ tree.organize() # rebuild a clean hierarchy and label every node
63
+ print(tree.show()) # human-readable outline
64
+ ```
65
+
66
+ Use a real embedding provider in production:
67
+
68
+ ```python
69
+ from embed_tree import EmbedTree, OpenAIEmbeddingProvider
70
+
71
+ embedder = OpenAIEmbeddingProvider(
72
+ model="text-embedding-3-small",
73
+ api_key="...",
74
+ )
75
+
76
+ tree = EmbedTree(embedder=embedder)
77
+ tree.add("Some document text", payload={"source": "docs"})
78
+ ```
79
+
80
+ ## Core API
81
+
82
+ `EmbedTree` is the main entry point.
83
+
84
+ ```python
85
+ tree.add(content, item_id=None, payload=None, text=None)
86
+ tree.add_batch(contents, item_ids=None, payloads=None, texts=None)
87
+ tree.add_node(content_node)
88
+ tree.add_nodes(content_nodes)
89
+ tree.add_partial_tree(partial_tree)
90
+
91
+ tree.organize(tagger=None)
92
+ tree.rebalance()
93
+ tree.label(tagger=None)
94
+
95
+ tree.query(content, k=10, exhaustive=False)
96
+ tree.remove(item_id)
97
+ tree.remove_batch(item_ids)
98
+
99
+ tree.show(max_items=3)
100
+ tree.to_dict(max_items=5)
101
+ tree.get_tree()
102
+ len(tree)
103
+ ```
104
+
105
+ `content` is what gets embedded. `text` is the human-readable string used in
106
+ labels and browse output; it defaults to `content` when `content` is a string.
107
+ `payload` is returned in query results and exported browse data.
108
+
109
+ ## Configuration
110
+
111
+ Configuration is explicit and code-driven through `TreeConfig`. It does not
112
+ read environment variables.
113
+
114
+ ```python
115
+ from embed_tree import LLMConfig, RebalanceConfig, TreeConfig
116
+
117
+ config = TreeConfig(
118
+ max_branches=5,
119
+ leaf_capacity=10,
120
+ rebalance=RebalanceConfig(enabled=True, every_n_inserts=10_000),
121
+ llm=LLMConfig(provider="none"), # default keyword labels, no network
122
+ )
123
+ ```
124
+
125
+ Defaults are tuned for readable taxonomies: small fan-out and small leaves.
126
+ Raise `max_branches` and `leaf_capacity` when using the tree primarily as a
127
+ retrieval index.
128
+
129
+ ## Querying
130
+
131
+ ```python
132
+ hits = tree.query("related content", k=5)
133
+ # [(item_id, distance, payload), ...]
134
+
135
+ exact_hits = tree.query("related content", k=5, exhaustive=True)
136
+ ```
137
+
138
+ Default queries route to one leaf and rank items there, which is fast but
139
+ approximate. `exhaustive=True` scans every item for exact nearest neighbors.
140
+
141
+ ## Persistence
142
+
143
+ ```python
144
+ from embed_tree import EmbedTree, FileTreeStore
145
+
146
+ tree = EmbedTree(
147
+ embedder=embedder,
148
+ store=FileTreeStore("./tree.json"),
149
+ )
150
+ ```
151
+
152
+ `FileTreeStore` saves an atomic JSON snapshot after writes and reloads it when
153
+ the tree is constructed again.
154
+
155
+ ## Labeling
156
+
157
+ Without extra configuration, node labels are generated locally from keywords.
158
+ For LLM labels:
159
+
160
+ ```python
161
+ from embed_tree import LLMConfig, TreeConfig
162
+
163
+ config = TreeConfig(
164
+ llm=LLMConfig(provider="openai", model="gpt-4o-mini", api_key="...")
165
+ )
166
+ ```
167
+
168
+ You can also pass a custom `tagger: Callable[[list[str]], str]` to
169
+ `EmbedTree(..., tagger=...)`, `tree.label(tagger=...)`, or
170
+ `tree.organize(tagger=...)`.
171
+
172
+ ## More Documentation
173
+
174
+ See [docs/API.md](docs/API.md) for the fuller API reference, provider details,
175
+ loader/persister abstractions, PCA options, and extension points.
176
+
177
+ ## Development
178
+
179
+ ```bash
180
+ uv sync --extra dev
181
+ uv run pytest -q
182
+ ```
@@ -0,0 +1,164 @@
1
+ # embed-tree
2
+
3
+ `embed-tree` turns content embeddings into a browsable, labeled hierarchy.
4
+ It is useful when you have documents, notes, tickets, search results, or any
5
+ other text-like records and want a small taxonomy that a person can inspect.
6
+
7
+ The library stays model-agnostic: you provide an embedder, and `embed-tree`
8
+ handles clustering, labeling, querying, deletion, and persistence.
9
+
10
+ ## Install
11
+
12
+ ```bash
13
+ pip install embed-tree
14
+ ```
15
+
16
+ Optional adapters:
17
+
18
+ ```bash
19
+ pip install "embed-tree[openai]" # OpenAI embeddings
20
+ pip install "embed-tree[local]" # sentence-transformers embeddings
21
+ pip install "embed-tree[sql]" # SQLAlchemy loaders/persisters
22
+ ```
23
+
24
+ ## Quick Start
25
+
26
+ ```python
27
+ from embed_tree import EmbedTree, FakeEmbeddingProvider, TreeConfig
28
+
29
+ embedder = FakeEmbeddingProvider(dim=32) # deterministic demo embedder
30
+ tree = EmbedTree(
31
+ embedder=embedder,
32
+ config=TreeConfig(max_branches=5, leaf_capacity=10),
33
+ )
34
+
35
+ tree.add_batch(
36
+ [
37
+ "Write import pipeline documentation",
38
+ "Fix login session refresh",
39
+ "Reduce report query latency",
40
+ "Add retry handling to data ingestion",
41
+ ]
42
+ )
43
+
44
+ tree.organize() # rebuild a clean hierarchy and label every node
45
+ print(tree.show()) # human-readable outline
46
+ ```
47
+
48
+ Use a real embedding provider in production:
49
+
50
+ ```python
51
+ from embed_tree import EmbedTree, OpenAIEmbeddingProvider
52
+
53
+ embedder = OpenAIEmbeddingProvider(
54
+ model="text-embedding-3-small",
55
+ api_key="...",
56
+ )
57
+
58
+ tree = EmbedTree(embedder=embedder)
59
+ tree.add("Some document text", payload={"source": "docs"})
60
+ ```
61
+
62
+ ## Core API
63
+
64
+ `EmbedTree` is the main entry point.
65
+
66
+ ```python
67
+ tree.add(content, item_id=None, payload=None, text=None)
68
+ tree.add_batch(contents, item_ids=None, payloads=None, texts=None)
69
+ tree.add_node(content_node)
70
+ tree.add_nodes(content_nodes)
71
+ tree.add_partial_tree(partial_tree)
72
+
73
+ tree.organize(tagger=None)
74
+ tree.rebalance()
75
+ tree.label(tagger=None)
76
+
77
+ tree.query(content, k=10, exhaustive=False)
78
+ tree.remove(item_id)
79
+ tree.remove_batch(item_ids)
80
+
81
+ tree.show(max_items=3)
82
+ tree.to_dict(max_items=5)
83
+ tree.get_tree()
84
+ len(tree)
85
+ ```
86
+
87
+ `content` is what gets embedded. `text` is the human-readable string used in
88
+ labels and browse output; it defaults to `content` when `content` is a string.
89
+ `payload` is returned in query results and exported browse data.
90
+
91
+ ## Configuration
92
+
93
+ Configuration is explicit and code-driven through `TreeConfig`. It does not
94
+ read environment variables.
95
+
96
+ ```python
97
+ from embed_tree import LLMConfig, RebalanceConfig, TreeConfig
98
+
99
+ config = TreeConfig(
100
+ max_branches=5,
101
+ leaf_capacity=10,
102
+ rebalance=RebalanceConfig(enabled=True, every_n_inserts=10_000),
103
+ llm=LLMConfig(provider="none"), # default keyword labels, no network
104
+ )
105
+ ```
106
+
107
+ Defaults are tuned for readable taxonomies: small fan-out and small leaves.
108
+ Raise `max_branches` and `leaf_capacity` when using the tree primarily as a
109
+ retrieval index.
110
+
111
+ ## Querying
112
+
113
+ ```python
114
+ hits = tree.query("related content", k=5)
115
+ # [(item_id, distance, payload), ...]
116
+
117
+ exact_hits = tree.query("related content", k=5, exhaustive=True)
118
+ ```
119
+
120
+ Default queries route to one leaf and rank items there, which is fast but
121
+ approximate. `exhaustive=True` scans every item for exact nearest neighbors.
122
+
123
+ ## Persistence
124
+
125
+ ```python
126
+ from embed_tree import EmbedTree, FileTreeStore
127
+
128
+ tree = EmbedTree(
129
+ embedder=embedder,
130
+ store=FileTreeStore("./tree.json"),
131
+ )
132
+ ```
133
+
134
+ `FileTreeStore` saves an atomic JSON snapshot after writes and reloads it when
135
+ the tree is constructed again.
136
+
137
+ ## Labeling
138
+
139
+ Without extra configuration, node labels are generated locally from keywords.
140
+ For LLM labels:
141
+
142
+ ```python
143
+ from embed_tree import LLMConfig, TreeConfig
144
+
145
+ config = TreeConfig(
146
+ llm=LLMConfig(provider="openai", model="gpt-4o-mini", api_key="...")
147
+ )
148
+ ```
149
+
150
+ You can also pass a custom `tagger: Callable[[list[str]], str]` to
151
+ `EmbedTree(..., tagger=...)`, `tree.label(tagger=...)`, or
152
+ `tree.organize(tagger=...)`.
153
+
154
+ ## More Documentation
155
+
156
+ See [docs/API.md](docs/API.md) for the fuller API reference, provider details,
157
+ loader/persister abstractions, PCA options, and extension points.
158
+
159
+ ## Development
160
+
161
+ ```bash
162
+ uv sync --extra dev
163
+ uv run pytest -q
164
+ ```
@@ -0,0 +1,121 @@
1
+ """embed-tree: an incremental hierarchical clustering tree over embeddings.
2
+
3
+ See DESIGN.md for the full design. Minimal usage:
4
+
5
+ from embed_tree import EmbedTree, TreeConfig, FileTreeStore
6
+
7
+ tree = EmbedTree(embedder=my_embed_fn, store=FileTreeStore("./tree.json"))
8
+ tree.add("some content")
9
+ hits = tree.query("similar content", k=5)
10
+ """
11
+
12
+ from .config import LLMConfig, RebalanceConfig, TreeConfig
13
+ from .embedders import HuggingFaceTextEmbedder, TextEmbedder, embed_texts
14
+ from .labelers import FunctionLabeler, LabelCandidate, Labeler, LabelRequest, LLMLabeler
15
+ from .loaders import (
16
+ FileSystemTreeLoader,
17
+ JsonTreeLoader,
18
+ SQLAlchemyContentLoader,
19
+ SQLAlchemyTreeLoader,
20
+ SQLiteTreeLoader,
21
+ TreeLoader,
22
+ )
23
+ from .persisters import (
24
+ FileSystemTreePersister,
25
+ FolderTreePersister,
26
+ JsonTreePersister,
27
+ MaterializedTreeState,
28
+ SQLAlchemyTreePersister,
29
+ TreePersister,
30
+ )
31
+ from .projectors import PCAConfig, PCAProjector, VectorProjector
32
+ from .providers import (
33
+ EmbeddingProvider,
34
+ FakeEmbeddingProvider,
35
+ OpenAIEmbeddingProvider,
36
+ SentenceTransformerProvider,
37
+ )
38
+ from .reducers import (
39
+ FreezePCAReducer,
40
+ IdentityReducer,
41
+ IncrementalPCAReducer,
42
+ Reducer,
43
+ )
44
+ from .reconcilers import DefaultTreeReconciler, TreeReconciler
45
+ from .representation import (
46
+ ContentNode,
47
+ DefaultTreeRepresentation,
48
+ KeyNode,
49
+ NodeAggregate,
50
+ NodeEmbedding,
51
+ NodeId,
52
+ PartialTree,
53
+ TreeEdge,
54
+ VectorData,
55
+ partial_tree_from_dict,
56
+ partial_tree_to_dict,
57
+ )
58
+ from .store import FileTreeStore, NullTreeStore, TreeState, TreeStore
59
+ from .taggers import KeywordTagger, LLMTagger, Tagger, make_tagger
60
+ from .tree import EmbedTree, Item, Node
61
+
62
+ __all__ = [
63
+ "EmbedTree",
64
+ "Item",
65
+ "Node",
66
+ "TreeConfig",
67
+ "RebalanceConfig",
68
+ "LLMConfig",
69
+ "TextEmbedder",
70
+ "HuggingFaceTextEmbedder",
71
+ "embed_texts",
72
+ "PCAConfig",
73
+ "PCAProjector",
74
+ "VectorProjector",
75
+ "LabelCandidate",
76
+ "LabelRequest",
77
+ "Labeler",
78
+ "FunctionLabeler",
79
+ "LLMLabeler",
80
+ "TreeLoader",
81
+ "FileSystemTreeLoader",
82
+ "JsonTreeLoader",
83
+ "SQLAlchemyContentLoader",
84
+ "SQLAlchemyTreeLoader",
85
+ "SQLiteTreeLoader",
86
+ "MaterializedTreeState",
87
+ "TreeReconciler",
88
+ "DefaultTreeReconciler",
89
+ "TreePersister",
90
+ "FolderTreePersister",
91
+ "FileSystemTreePersister",
92
+ "JsonTreePersister",
93
+ "SQLAlchemyTreePersister",
94
+ "DefaultTreeRepresentation",
95
+ "PartialTree",
96
+ "ContentNode",
97
+ "KeyNode",
98
+ "TreeEdge",
99
+ "NodeEmbedding",
100
+ "NodeAggregate",
101
+ "NodeId",
102
+ "VectorData",
103
+ "partial_tree_from_dict",
104
+ "partial_tree_to_dict",
105
+ "TreeState",
106
+ "TreeStore",
107
+ "FileTreeStore",
108
+ "NullTreeStore",
109
+ "EmbeddingProvider",
110
+ "FakeEmbeddingProvider",
111
+ "OpenAIEmbeddingProvider",
112
+ "SentenceTransformerProvider",
113
+ "Reducer",
114
+ "IdentityReducer",
115
+ "FreezePCAReducer",
116
+ "IncrementalPCAReducer",
117
+ "Tagger",
118
+ "KeywordTagger",
119
+ "LLMTagger",
120
+ "make_tagger",
121
+ ]
@@ -0,0 +1,10 @@
1
+ """Deprecated cache compatibility imports.
2
+
3
+ Use loaders plus persisters directly for new code.
4
+ """
5
+
6
+ from .json import JsonTreeCache
7
+ from .model import MaterializedTreeState, TreeCache
8
+ from .sqlalchemy import SQLAlchemyTreeCache
9
+
10
+ __all__ = ["MaterializedTreeState", "TreeCache", "JsonTreeCache", "SQLAlchemyTreeCache"]
@@ -0,0 +1,7 @@
1
+ """Compatibility import for JSON tree cache."""
2
+
3
+ from embed_tree.loaders.json import JsonTreeLoader
4
+
5
+
6
class JsonTreeCache(JsonTreeLoader):
    """Legacy alias of ``JsonTreeLoader`` kept for backward compatibility.

    Defines nothing of its own; all behavior is inherited from the loader.
    """
@@ -0,0 +1,13 @@
1
+ """Deprecated cache compatibility contract."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Protocol, runtime_checkable
6
+
7
+ from embed_tree.loaders.model import TreeLoader
8
+ from embed_tree.persisters.model import MaterializedTreeState, TreePersister
9
+
10
+
11
@runtime_checkable
class TreeCache(TreeLoader, TreePersister, Protocol):
    """Deprecated combined contract: ``TreeLoader`` plus ``TreePersister``.

    Declares no members of its own. It is decorated with
    ``runtime_checkable`` so it can be used in ``isinstance`` checks.
    """
@@ -0,0 +1,7 @@
1
+ """Compatibility import for SQLAlchemy tree cache."""
2
+
3
+ from embed_tree.loaders.sqlalchemy import SQLAlchemyTreeLoader
4
+
5
+
6
class SQLAlchemyTreeCache(SQLAlchemyTreeLoader):
    """Legacy alias of ``SQLAlchemyTreeLoader`` kept for backward compatibility.

    Defines nothing of its own; all behavior is inherited from the loader.
    """
@@ -0,0 +1,100 @@
1
+ """Configuration for embed-tree, as a single pydantic object.
2
+
3
+ `TreeConfig` is a plain pydantic `BaseModel` (NOT `BaseSettings`): it is
4
+ constructed explicitly and handed whole to `EmbedTree(config=...)`. It does
5
+ **not** read environment variables — every value must be passed in code, so the
6
+ configuration is always explicit and reproducible.
7
+
8
+ See DESIGN.md §4/§5.3. M1 adds PCA dimensionality reduction (pca_dims) in two
9
+ modes (freeze / incremental); see those sections for the rebalance contract.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from typing import Literal
15
+
16
+ from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
17
+
18
+
19
class RebalanceConfig(BaseModel):
    """Controls if and when the whole tree is rebuilt from its leaves.

    See DESIGN.md §4.
    """

    enabled: bool = True
    # Cadence of the automatic rebuild, counted in inserts; None disables it.
    every_n_inserts: int | None = 10_000
    # Permit a manual tree.rebalance() call.
    on_demand: bool = True
25
+
26
+
27
class LLMConfig(BaseModel):
    """Settings for automatic naming of taxonomy nodes (DESIGN.md §10).

    With provider "none" a keyword tagger that needs no network is used;
    "openai" and "local" generate labels with an LLM.
    """

    provider: Literal["none", "openai", "local"] = "none"
    # OpenAI model id, or a Hugging Face model id when the provider is local.
    model: str = "gpt-4o-mini"
    # OpenAI key; always passed explicitly, never read from the environment.
    api_key: str | None = None
    # OpenAI-compatible endpoint (for example a local server).
    base_url: str | None = None
    # How many member texts are shown to the LLM when it names a cluster.
    max_samples: int = 15
    # Word budget that keeps labels short and browsable.
    max_label_words: int = 6
38
+
39
+
40
class TreeConfig(BaseModel):
    """Top-level tree settings; defaults target the M0/M1 (<100k items) regime.

    Instances are built explicitly in code. Nothing is loaded from
    environment variables.
    """

    # Clearing the protected namespaces allows the `model_args` field name.
    model_config = ConfigDict(protected_namespaces=())

    # These defaults aim at a taxonomy a person can browse (DESIGN.md §10):
    # small fan-out and small leaves keep each level readable (<=5 sub-topics,
    # <=10 items per leaf). Raise both for a large-scale retrieval index.
    # Maximum sub-topics per level (k for KMeans).
    max_branches: int = 5
    # Maximum items a leaf holds before it subdivides.
    leaf_capacity: int = 10
    # M0 supports "kmeans" only.
    split_algo: str = "kmeans"

    # Distance is always cosine: vectors are L2-normalized and compared with
    # Euclidean (rank-equivalent on the unit sphere). Embeddings encode meaning
    # in direction, not magnitude, so there is no separate distance knob.

    # --- PCA dimensionality reduction (M1; see DESIGN.md §5.3) -------------
    # Disabled by default: it only pays off at scale (thousands+). With tens
    # of items PCA is meaningless (too few samples) and pca_warmup is never
    # reached anyway.
    # None means no reduction (operate in raw space).
    pca_dims: int | None = None
    pca_mode: Literal["freeze", "incremental"] = "freeze"
    # Items buffered before the first PCA fit.
    pca_warmup: int = 1000
    # Incremental mode: partial_fit cadence.
    pca_batch_size: int = 256

    rebalance: RebalanceConfig = Field(default_factory=RebalanceConfig)
    # Node auto-naming settings.
    llm: LLMConfig = Field(default_factory=LLMConfig)
    # Passed through to KMeans.
    model_args: dict = Field(default_factory=dict)

    @field_validator("max_branches")
    @classmethod
    def _min_branches(cls, v: int) -> int:
        """Accept a fan-out of two or more; raise ValueError otherwise."""
        if v >= 2:
            return v
        raise ValueError("max_branches must be >= 2")

    @field_validator("split_algo")
    @classmethod
    def _supported_split(cls, v: str) -> str:
        """Accept only "kmeans"; raise NotImplementedError for anything else."""
        if v == "kmeans":
            return v
        raise NotImplementedError(
            f"split_algo={v!r} arrives in a later milestone; "
            "M0 supports 'kmeans' only"
        )

    @model_validator(mode="after")
    def _cross_field(self) -> "TreeConfig":
        """Check constraints that span several fields, then return self."""
        if self.leaf_capacity < self.max_branches:
            raise ValueError("leaf_capacity must be >= max_branches")

        dims = self.pca_dims
        if dims is None:
            # No PCA requested, so the PCA-specific checks do not apply.
            return self

        if dims < 2:
            raise ValueError("pca_dims must be >= 2")
        # PCA needs at least n_components samples to fit / partial_fit.
        if self.pca_warmup < dims:
            raise ValueError("pca_warmup must be >= pca_dims")
        if self.pca_batch_size < dims:
            raise ValueError("pca_batch_size must be >= pca_dims")
        return self
@@ -0,0 +1,7 @@
1
+ """Embedding model integrations."""
2
+
3
+ from .huggingface import HuggingFaceTextEmbedder, embed_texts
4
+ from .model import TextEmbedder, Vector
5
+
6
+ __all__ = ["TextEmbedder", "Vector", "HuggingFaceTextEmbedder", "embed_texts"]
7
+
@@ -0,0 +1,69 @@
1
+ """Open-source local embeddings via Hugging Face sentence-transformers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ import numpy as np
9
+
10
+ from embed_tree.providers.local import SentenceTransformerProvider
11
+
12
+
13
class HuggingFaceTextEmbedder(SentenceTransformerProvider):
    """Sentence-transformers embedder with automatic device selection.

    When no ``model_obj`` is supplied, a ``SentenceTransformer`` is built from
    the ``model`` id, the resolved device and the optional ``cache_folder``.
    With ``device="auto"`` the device is chosen by ``_resolve_device``: MPS
    when PyTorch reports it as available, then CUDA, then CPU; if PyTorch
    cannot be imported the device is left as ``None``.
    """

    def __init__(
        self,
        model: str = "BAAI/bge-small-en-v1.5",
        *,
        device: str | None = "auto",
        cache_folder: str | Path | None = None,
        model_obj: Any | None = None,
        encode_kwargs: dict[str, Any] | None = None,
        **kwargs: Any,
    ) -> None:
        """Resolve the device, build the model if needed, and initialise the base.

        Args:
            model: Model id passed to ``SentenceTransformer`` and to the base
                class.
            device: Explicit device string, ``None``, or ``"auto"`` to let
                ``_resolve_device`` pick one.
            cache_folder: Optional cache directory; stored as a string and
                passed to ``SentenceTransformer`` when the model is built here.
            model_obj: Pre-built model object. When given, it is forwarded
                as-is and ``sentence_transformers`` is not imported.
            encode_kwargs: Forwarded unchanged to the base class.
            **kwargs: Forwarded unchanged to the base class.

        Raises:
            ImportError: If ``model_obj`` is omitted and ``sentence_transformers``
                is not installed.
        """
        resolved_device = _resolve_device(device)
        self.model_name = model
        self.device = resolved_device
        self.cache_folder = None if cache_folder is None else str(cache_folder)

        if model_obj is None:
            try:
                from sentence_transformers import SentenceTransformer
            except ImportError as e:  # pragma: no cover
                raise ImportError(
                    'HuggingFaceTextEmbedder needs the "local" extra: '
                    'pip install "embed-tree[local]"'
                ) from e
            model_obj = SentenceTransformer(
                model,
                device=resolved_device,
                cache_folder=self.cache_folder,
            )

        # One base-class call serves both paths: by this point model_obj is
        # either the caller's object or the one constructed just above.
        super().__init__(
            model=model,
            device=resolved_device,
            model_obj=model_obj,
            encode_kwargs=encode_kwargs,
            **kwargs,
        )
47
+
48
+
49
+ def _resolve_device(device: str | None) -> str | None:
50
+ if device != "auto":
51
+ return device
52
+ try:
53
+ import torch
54
+ except ImportError:
55
+ return None
56
+ if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
57
+ return "mps"
58
+ if torch.cuda.is_available():
59
+ return "cuda"
60
+ return "cpu"
61
+
62
+
63
def embed_texts(embedder: Any, texts: list[str]) -> np.ndarray:
    """Embed ``texts`` and return the result as a NumPy array.

    If ``embedder`` has a callable ``embed_batch`` attribute, it is called
    once with the whole list. Otherwise ``embedder`` itself is called once
    per text and the results are stacked.
    """
    batch_fn = getattr(embedder, "embed_batch", None)
    if not callable(batch_fn):
        per_text = [embedder(text) for text in texts]
        return np.asarray(per_text)
    return np.asarray(batch_fn(texts))
69
+