lnclite 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lnclite-0.1.0/LICENSE +21 -0
- lnclite-0.1.0/PKG-INFO +101 -0
- lnclite-0.1.0/README.md +75 -0
- lnclite-0.1.0/lnclite/__init__.py +826 -0
- lnclite-0.1.0/lnclite/file_ingestor.py +133 -0
- lnclite-0.1.0/lnclite/utils/__init__.py +0 -0
- lnclite-0.1.0/lnclite/utils/calculate_index_params.py +18 -0
- lnclite-0.1.0/lnclite/utils/get_file_fp.py +13 -0
- lnclite-0.1.0/lnclite/utils/get_folder_fingerprint.py +37 -0
- lnclite-0.1.0/lnclite/utils/snowflake.py +95 -0
- lnclite-0.1.0/pyproject.toml +59 -0
lnclite-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 AllenChou
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
lnclite-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lnclite
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Lite usages of lancedb.
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Author: Allen Chou
|
|
8
|
+
Author-email: f1470891079@gmail.com
|
|
9
|
+
Requires-Python: >=3.11,<4
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
15
|
+
Requires-Dist: lancedb
|
|
16
|
+
Requires-Dist: openai
|
|
17
|
+
Requires-Dist: openai-embeddings-model
|
|
18
|
+
Requires-Dist: paginatic
|
|
19
|
+
Requires-Dist: pydantic (>=2)
|
|
20
|
+
Requires-Dist: xxhash
|
|
21
|
+
Project-URL: Homepage, https://github.com/allen2c/lnclite
|
|
22
|
+
Project-URL: PyPI, https://pypi.org/project/lnclite/
|
|
23
|
+
Project-URL: Repository, https://github.com/allen2c/lnclite
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# lnclite
|
|
27
|
+
|
|
28
|
+
`lnclite` is a small async LanceDB document store for OpenAI-compatible embeddings. It gives you a compact API for creating a local vector database, adding documents, filtering by tags, and running semantic search.
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install lnclite
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
For local development from this repository:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
poetry install --all-groups
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Quick Start
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
import asyncio
|
|
46
|
+
|
|
47
|
+
from openai import AsyncOpenAI
|
|
48
|
+
from openai_embeddings_model import ModelSettings
|
|
49
|
+
|
|
50
|
+
from lnclite import DocumentCreate, Lnclite, get_openai_embeddings_model
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
async def main():
|
|
54
|
+
embeddings = get_openai_embeddings_model(
|
|
55
|
+
openai_client=AsyncOpenAI(),
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
client = await Lnclite.new(
|
|
59
|
+
lancedb_path="outputs/demo.lance",
|
|
60
|
+
openai_embeddings_model=embeddings,
|
|
61
|
+
model_settings=ModelSettings(dimensions=1536),
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
await client.documents.batch_create(
|
|
65
|
+
[
|
|
66
|
+
DocumentCreate(
|
|
67
|
+
content="A note about async Python clients.",
|
|
68
|
+
tags=["type:note", "topic:python"],
|
|
69
|
+
),
|
|
70
|
+
DocumentCreate(
|
|
71
|
+
content="A note about vector search and indexing.",
|
|
72
|
+
tags=["type:note", "topic:search"],
|
|
73
|
+
),
|
|
74
|
+
]
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
await client.create_index()
|
|
78
|
+
|
|
79
|
+
results = await client.search(
|
|
80
|
+
"How should I design vector search?",
|
|
81
|
+
tags_any=["topic:search"],
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
for result in results.results:
|
|
85
|
+
print(result.document.content)
|
|
86
|
+
print(result.document.tags)
|
|
87
|
+
print(result.distance)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
if __name__ == "__main__":
|
|
91
|
+
asyncio.run(main())
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Documentation
|
|
95
|
+
|
|
96
|
+
Full documentation is published with MkDocs Material from this repository's `docs/` directory.
|
|
97
|
+
|
|
98
|
+
## License
|
|
99
|
+
|
|
100
|
+
MIT
|
|
101
|
+
|
lnclite-0.1.0/README.md
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# lnclite
|
|
2
|
+
|
|
3
|
+
`lnclite` is a small async LanceDB document store for OpenAI-compatible embeddings. It gives you a compact API for creating a local vector database, adding documents, filtering by tags, and running semantic search.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install lnclite
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
For local development from this repository:
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
poetry install --all-groups
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Quick Start
|
|
18
|
+
|
|
19
|
+
```python
|
|
20
|
+
import asyncio
|
|
21
|
+
|
|
22
|
+
from openai import AsyncOpenAI
|
|
23
|
+
from openai_embeddings_model import ModelSettings
|
|
24
|
+
|
|
25
|
+
from lnclite import DocumentCreate, Lnclite, get_openai_embeddings_model
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
async def main():
|
|
29
|
+
embeddings = get_openai_embeddings_model(
|
|
30
|
+
openai_client=AsyncOpenAI(),
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
client = await Lnclite.new(
|
|
34
|
+
lancedb_path="outputs/demo.lance",
|
|
35
|
+
openai_embeddings_model=embeddings,
|
|
36
|
+
model_settings=ModelSettings(dimensions=1536),
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
await client.documents.batch_create(
|
|
40
|
+
[
|
|
41
|
+
DocumentCreate(
|
|
42
|
+
content="A note about async Python clients.",
|
|
43
|
+
tags=["type:note", "topic:python"],
|
|
44
|
+
),
|
|
45
|
+
DocumentCreate(
|
|
46
|
+
content="A note about vector search and indexing.",
|
|
47
|
+
tags=["type:note", "topic:search"],
|
|
48
|
+
),
|
|
49
|
+
]
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
await client.create_index()
|
|
53
|
+
|
|
54
|
+
results = await client.search(
|
|
55
|
+
"How should I design vector search?",
|
|
56
|
+
tags_any=["topic:search"],
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
for result in results.results:
|
|
60
|
+
print(result.document.content)
|
|
61
|
+
print(result.document.tags)
|
|
62
|
+
print(result.distance)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
if __name__ == "__main__":
|
|
66
|
+
asyncio.run(main())
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Documentation
|
|
70
|
+
|
|
71
|
+
Full documentation is published with MkDocs Material from this repository's `docs/` directory.
|
|
72
|
+
|
|
73
|
+
## License
|
|
74
|
+
|
|
75
|
+
MIT
|
|
@@ -0,0 +1,826 @@
|
|
|
1
|
+
"""Small async LanceDB document store with OpenAI embeddings."""
|
|
2
|
+
|
|
3
|
+
import functools
|
|
4
|
+
import hashlib
|
|
5
|
+
import logging
|
|
6
|
+
import time
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import (
|
|
9
|
+
TYPE_CHECKING,
|
|
10
|
+
Dict,
|
|
11
|
+
Final,
|
|
12
|
+
List,
|
|
13
|
+
Literal,
|
|
14
|
+
Optional,
|
|
15
|
+
Text,
|
|
16
|
+
Type,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
import diskcache
|
|
20
|
+
import lancedb
|
|
21
|
+
import lancedb.index
|
|
22
|
+
import numpy as np
|
|
23
|
+
import tiktoken
|
|
24
|
+
from lancedb.pydantic import LanceModel, Vector
|
|
25
|
+
from openai import AsyncOpenAI
|
|
26
|
+
from openai_embeddings_model import MAX_BATCH_SIZE as DEFAULT_EMBEDDINGS_MAX_BATCH_SIZE
|
|
27
|
+
from openai_embeddings_model import (
|
|
28
|
+
AsyncOpenAIEmbeddingsModel,
|
|
29
|
+
ModelSettings,
|
|
30
|
+
)
|
|
31
|
+
from openai_embeddings_model.normalize import normalize
|
|
32
|
+
from paginatic import TokenPaginatic
|
|
33
|
+
from paginatic.helpers import decode_and_verify, encode_and_sign
|
|
34
|
+
from pydantic import BaseModel, Field, model_validator
|
|
35
|
+
|
|
36
|
+
if TYPE_CHECKING:
|
|
37
|
+
from lnclite.file_ingestor import FileReader
|
|
38
|
+
|
|
39
|
+
__version__: Final[Text] = "0.1.0"
|
|
40
|
+
__all__: Final[List[Text]] = [
|
|
41
|
+
"Document",
|
|
42
|
+
"DocumentCreate",
|
|
43
|
+
"Lnclite",
|
|
44
|
+
"LncliteNotFoundError",
|
|
45
|
+
"ManifestModel",
|
|
46
|
+
"SearchResult",
|
|
47
|
+
"SearchResults",
|
|
48
|
+
"get_model_settings",
|
|
49
|
+
"get_openai_embeddings_model",
|
|
50
|
+
]
|
|
51
|
+
|
|
52
|
+
logger = logging.getLogger(__name__)
|
|
53
|
+
|
|
54
|
+
DEFAULT_MANIFEST_TABLE = "manifest"
|
|
55
|
+
DEFAULT_DOCUMENT_TABLE = "documents"
|
|
56
|
+
DEFAULT_OPENAI_MODEL = "text-embedding-3-small"
|
|
57
|
+
DEFAULT_MAX_INPUT_TOKENS = 4096
|
|
58
|
+
DEFAULT_DIMENSIONS = 1536
|
|
59
|
+
|
|
60
|
+
VectorIndexPreference = Literal["storage", "balanced", "accuracy", "latency"]
|
|
61
|
+
ListOrder = Literal["asc", "desc", 1, -1]
|
|
62
|
+
SqlOrder = Literal["ASC", "DESC"]
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def gen_id() -> str:
    """Return a fresh snowflake-style identifier.

    The snowflake module is imported lazily so importing ``lnclite`` does
    not pull in the id generator (and to avoid import cycles).
    """
    from lnclite.utils import snowflake

    return snowflake.generate_id()
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@functools.cache
def get_document_lancedb_model(dim: int) -> Type[LanceModel]:
    """Build (and cache) the LanceDB document schema for a given vector width.

    LanceDB requires the vector dimensionality to be fixed in the schema, so
    a distinct model class is generated per ``dim``; ``functools.cache``
    guarantees the same class object is returned for repeated calls.
    """

    class DocumentLancedbModel(LanceModel):
        # NOTE(review): gen_id() is annotated to return str while this field
        # is int — presumably pydantic coerces the numeric snowflake string;
        # confirm against lnclite.utils.snowflake.
        id: int = Field(default_factory=gen_id)
        content: Text = Field(description="The content of the document.")  # noqa: E501
        # Derived from content in the validator below; callers need not set it.
        md5: Text = ""
        vector: Vector(dim)
        tags: List[Text] = Field(default_factory=list)

        @model_validator(mode="after")
        def validate_values(self) -> "DocumentLancedbModel":
            # Normalize whitespace, reject empty documents, and fingerprint
            # the final content so duplicates can be detected.
            self.content = self.content.strip()
            if not self.content:
                raise ValueError("Content cannot be empty")
            self.md5 = hashlib.md5(self.content.encode()).hexdigest()
            return self

    return DocumentLancedbModel
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@functools.cache
def get_openai_client() -> AsyncOpenAI:
    """Return the process-wide default AsyncOpenAI client (created once)."""
    client = AsyncOpenAI()
    return client
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@functools.cache
def get_embeddings_cache() -> diskcache.Cache:
    """Return the shared on-disk embeddings cache under ``.cache/embeddings``."""
    cache_dir = ".cache/embeddings"
    return diskcache.Cache(cache_dir)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@functools.cache
def get_encoding_for_model(model: str) -> tiktoken.Encoding:
    """Return the tiktoken encoding for *model*, cached per model name.

    Falls back to the 'gpt-5' encoding when tiktoken does not recognize the
    model name (e.g. a custom or very new model).
    """
    try:
        return tiktoken.encoding_for_model(model)
    except KeyError:
        # Lazy %-style args defer formatting until the record is emitted;
        # the original message was also missing its closing quote.
        logger.warning(
            "Encoding for model %s not found, using default encoding 'gpt-5'",
            model,
        )
        return tiktoken.encoding_for_model("gpt-5")
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def get_openai_embeddings_model(
    model: str = DEFAULT_OPENAI_MODEL,
    openai_client: AsyncOpenAI | None = None,
    cache: diskcache.Cache | None = None,
    encoding: tiktoken.Encoding | None = None,
    max_batch_size: int = DEFAULT_EMBEDDINGS_MAX_BATCH_SIZE,
    max_input_tokens: int = DEFAULT_MAX_INPUT_TOKENS,
) -> "AsyncOpenAIEmbeddingsModel":
    """Assemble an AsyncOpenAIEmbeddingsModel.

    Any piece left unset falls back to a process-wide default: the shared
    OpenAI client, the shared disk cache, and the encoding for *model*.
    """
    resolved_client = openai_client or get_openai_client()
    resolved_cache = cache or get_embeddings_cache()
    resolved_encoding = encoding or get_encoding_for_model(model)

    return AsyncOpenAIEmbeddingsModel(
        model=model,
        openai_client=resolved_client,
        cache=resolved_cache,
        encoding=resolved_encoding,
        max_batch_size=max_batch_size,
        max_input_tokens=max_input_tokens,
    )
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def get_model_settings(dimensions: int = DEFAULT_DIMENSIONS) -> "ModelSettings":
    """Return model settings pinned to an explicit embedding dimensionality."""
    settings = ModelSettings(dimensions=dimensions)
    return settings
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def quote_sql_string(s: str) -> str:
    """Return *s* as a single-quoted SQL string literal.

    Embedded single quotes are escaped by doubling, per standard SQL.
    """
    escaped = s.replace("'", "''")
    return f"'{escaped}'"
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def tag_filter_any(tags: list[str]) -> str:
    """Build a SQL predicate matching rows whose tags overlap *tags*."""
    quoted = [quote_sql_string(tag) for tag in tags]
    return "array_has_any(tags, [" + ", ".join(quoted) + "])"
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def tag_filter_all(tags: list[str]) -> str:
    """Build a SQL predicate matching rows whose tags contain all of *tags*."""
    quoted = [quote_sql_string(tag) for tag in tags]
    return "array_has_all(tags, [" + ", ".join(quoted) + "])"
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def recommended_vector_index_config(
    row_count: int,
    dim: int,
    *,
    prefer: VectorIndexPreference = "balanced",
):
    """Return a LanceDB vector index config for dot-search.

    Assumes document vectors and query vectors are normalized.
    Returns None when brute-force is better or when there are not enough rows.

    The tiers below trade index size, build cost, and recall based on row
    count and the caller's preference (storage / balanced / accuracy / latency).
    """

    # Too small. Brute-force is exact and fast.
    # Also avoids PQ training errors.
    if row_count < 256:
        return None

    # Still small. Brute-force is usually fine.
    # If you really want an index, IvfFlat is safer than PQ.
    if row_count < 10_000:
        if prefer in {"accuracy", "latency"}:
            return lancedb.index.IvfFlat(
                distance_type="dot",
                num_partitions=32,
            )
        return None

    # Medium: IVF-Flat keeps exact vectors (no quantization loss).
    if row_count < 50_000:
        return lancedb.index.IvfFlat(
            distance_type="dot",
            num_partitions=128,
        )

    # Large-ish: PQ compression only when the caller prioritizes storage.
    if row_count < 100_000:
        if prefer == "storage":
            return lancedb.index.IvfPq(
                distance_type="dot",
                num_partitions=256,
                num_sub_vectors=recommended_num_sub_vectors(dim, prefer),
                num_bits=8,
            )

        return lancedb.index.IvfFlat(
            distance_type="dot",
            num_partitions=256,
        )

    # From here on PQ is required to keep the index manageable; partition
    # counts scale with row count (fewer partitions when storage matters).
    if row_count < 500_000:
        return lancedb.index.IvfPq(
            distance_type="dot",
            num_partitions=1024 if prefer == "storage" else 2048,
            num_sub_vectors=recommended_num_sub_vectors(dim, prefer),
            num_bits=8,
        )

    if row_count < 1_000_000:
        return lancedb.index.IvfPq(
            distance_type="dot",
            num_partitions=2048 if prefer == "storage" else 4096,
            num_sub_vectors=recommended_num_sub_vectors(dim, prefer),
            num_bits=8,
        )

    # Very large + latency-sensitive: graph index (HNSW) over PQ codes.
    if prefer == "latency":
        return lancedb.index.HnswPq(
            distance_type="dot",
            m=20,
            ef_construction=300,
            num_sub_vectors=recommended_num_sub_vectors(dim, "balanced"),
            num_bits=8,
        )

    return lancedb.index.IvfPq(
        distance_type="dot",
        num_partitions=4096 if prefer in {"storage", "balanced"} else 8192,
        num_sub_vectors=recommended_num_sub_vectors(dim, prefer),
        num_bits=8,
    )
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def recommended_num_sub_vectors(
    dim: int, prefer: "VectorIndexPreference" = "balanced"
) -> int:
    """Return a PQ subvector count.

    Higher = more accurate, larger index.
    Lower = more compressed, lower recall.

    Picks the divisor ``x`` of ``dim`` whose subvector length (``dim / x``)
    is closest to a preference-dependent target, so PQ splits the vector
    evenly.

    Args:
        dim: Embedding dimensionality; must be a positive integer.
        prefer: Index tuning preference.

    Returns:
        A divisor of ``dim``.

    Raises:
        ValueError: If ``dim`` is not positive.
    """
    # Fail with a clear message instead of min() raising
    # "min() arg is an empty sequence" on a non-positive dim.
    if dim <= 0:
        raise ValueError(f"dim must be a positive integer, got {dim}")

    # Prefer subvector sizes around 8~16 dimensions.
    if prefer == "storage":
        target_sub_dim = 16
    elif prefer == "accuracy":
        target_sub_dim = 8
    else:
        target_sub_dim = 12

    # num_sub_vectors must divide dim evenly; on ties min() keeps the
    # smallest divisor (same behavior as before).
    candidates = [x for x in range(1, dim + 1) if dim % x == 0]
    return min(candidates, key=lambda x: abs((dim / x) - target_sub_dim))
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
async def get_default_dimensions(
    openai_embeddings_model: AsyncOpenAIEmbeddingsModel,
) -> int:
    """Probe the embeddings model with a tiny input and report its width."""
    probe = await openai_embeddings_model.get_embeddings(
        input="Hello, world!",
        model_settings=ModelSettings(),
    )
    return probe.to_numpy().shape[1]
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
class ManifestLancedbModel(LanceModel):
    """LanceDB row schema for the manifest table (one row per database)."""

    # Snowflake identifier assigned at creation time.
    id: int = Field(default_factory=gen_id)
    name: Text = Field(description="The name of the database.")
    description: Text = Field(description="The description of the database.")
    model: Text = Field(description="The embedding model name.")
    dimensions: int = Field(description="The dimensions of the embeddings.")
    # Unix epoch seconds; refreshed on every upsert.
    last_updated: int = Field(default_factory=lambda: int(time.time()))
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
class ManifestModel(LanceModel):
    """Read-only view of a manifest row returned to callers (no defaults)."""

    id: int
    name: Text
    description: Text
    model: Text
    dimensions: int
    # Unix epoch seconds of the last upsert.
    last_updated: int
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
class DocumentCreate(BaseModel):
    """Input payload for creating a document: content plus optional tags."""

    content: Text
    tags: List[Text] = Field(default_factory=list)

    @model_validator(mode="after")
    def validate_values(self) -> "DocumentCreate":
        # Strip surrounding whitespace and reject documents that end up empty.
        self.content = self.content.strip()
        if not self.content:
            raise ValueError("Content cannot be empty")
        return self
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
class Document(BaseModel):
    """A stored document as returned from the documents table."""

    id: int
    content: Text
    # md5 hex digest of the stripped content, set at write time.
    md5: Text
    # Embedding vector; None when not included in the query projection.
    vector: Optional[List[float]]
    tags: List[Text]
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
class Lnclite:
    """Async LanceDB-backed document store with OpenAI-compatible embeddings.

    Prefer the async factories (`new`, `new_from_dir`, `load`) over calling
    ``__init__`` directly: they resolve the embedding dimensionality before
    the schema classes are built and validate the on-disk state.
    """

    def __init__(
        self,
        lancedb_path: Path | str | None = None,
        *,
        manifest_table: Text = DEFAULT_MANIFEST_TABLE,
        document_table: Text = DEFAULT_DOCUMENT_TABLE,
        openai_embeddings_model: "AsyncOpenAIEmbeddingsModel",
        model_settings: "ModelSettings",
        token_secret_key: Text = "__lnclite__",
        vector_search_prefer: VectorIndexPreference = "balanced",
        verbose: bool = False,
    ):
        # NOTE(review): lancedb_path defaults to None but Path(None) raises
        # TypeError — confirm callers always supply a path (the factories do).
        self.lancedb_path = Path(lancedb_path)
        # Opened lazily on first use; see get_connection().
        self._connection: lancedb.AsyncConnection | None = None

        self.manifest_table = manifest_table
        self.document_table = document_table

        self.openai_embeddings_model = openai_embeddings_model
        self.model_settings = model_settings
        # NOTE(review): reads a private attribute of the embeddings model.
        self.max_tokens = self.openai_embeddings_model._max_input_tokens

        self._secret_key = token_secret_key
        self.vector_search_prefer = vector_search_prefer
        self.verbose = verbose

        # The document schema needs a concrete vector width up front.
        if self.model_settings.dimensions is None:
            raise ValueError("Model settings dimensions is not set")
        self._document_lancedb_model: Type[LanceModel] = get_document_lancedb_model(
            self.model_settings.dimensions
        )
        self._manifest_lancedb_model = ManifestLancedbModel

    @classmethod
    async def new(
        cls,
        lancedb_path: Path | str,
        *,
        manifest_table: Text = DEFAULT_MANIFEST_TABLE,
        document_table: Text = DEFAULT_DOCUMENT_TABLE,
        openai_embeddings_model: "AsyncOpenAIEmbeddingsModel",
        model_settings: "ModelSettings",
        token_secret_key: Text = "__lnclite__",
        vector_search_prefer: VectorIndexPreference = "balanced",
        verbose: bool = False,
    ) -> "Lnclite":
        """Create a brand-new database at *lancedb_path*.

        The target directory must not exist or must be empty; the embedding
        dimensionality is probed from the model when not given.
        """
        lancedb_path = Path(lancedb_path)
        if lancedb_path.is_dir():
            # Refuse a non-empty directory to avoid clobbering existing data.
            for _ in lancedb_path.iterdir():
                raise ValueError(f"Lancedb path {lancedb_path} already exists ")
        else:
            lancedb_path.mkdir(parents=True, exist_ok=True)

        if model_settings.dimensions is None:
            model_settings.dimensions = await get_default_dimensions(
                openai_embeddings_model
            )

        return cls(
            lancedb_path=lancedb_path,
            manifest_table=manifest_table,
            document_table=document_table,
            openai_embeddings_model=openai_embeddings_model,
            model_settings=model_settings,
            token_secret_key=token_secret_key,
            vector_search_prefer=vector_search_prefer,
            verbose=verbose,
        )

    @classmethod
    async def new_from_dir(
        cls,
        dir_path: Path | str,
        lancedb_path: Path | str,
        *,
        dataset_name: Text,
        dataset_description: Text,
        manifest_table: Text = DEFAULT_MANIFEST_TABLE,
        document_table: Text = DEFAULT_DOCUMENT_TABLE,
        openai_embeddings_model: "AsyncOpenAIEmbeddingsModel",
        model_settings: "ModelSettings",
        extension_readers: Optional[Dict[str, "FileReader"]] = None,
        batch_size: int = 100,
    ) -> "Lnclite":
        """Create a new database by ingesting every readable file under *dir_path*.

        Each file becomes one document tagged with its path relative to
        *dir_path* (``path:<relative>``); documents are written in batches of
        *batch_size* and a vector index is built at the end.
        """
        from lnclite.file_ingestor import FileIngestor

        dir_path = Path(dir_path)
        if not dir_path.is_dir():
            raise FileNotFoundError(f"Directory {dir_path} not found")

        lancedb_path = Path(lancedb_path)
        if lancedb_path.is_dir():
            # Refuse a non-empty directory to avoid clobbering existing data.
            for _ in lancedb_path.iterdir():
                raise ValueError(f"Lancedb path {lancedb_path} already exists ")

        if model_settings.dimensions is None:
            model_settings.dimensions = await get_default_dimensions(
                openai_embeddings_model
            )

        lnclite = cls(
            lancedb_path=lancedb_path,
            manifest_table=manifest_table,
            document_table=document_table,
            openai_embeddings_model=openai_embeddings_model,
            model_settings=model_settings,
        )

        # Create manifest
        await lnclite.manifest.upsert(
            name=dataset_name,
            description=dataset_description,
            model=openai_embeddings_model.model,
            dimensions=model_settings.dimensions,
        )

        # Create documents
        file_ingestor = FileIngestor()
        if extension_readers is not None:
            for extension, reader in extension_readers.items():
                file_ingestor.register_reader(extension, reader)

        batch: List[DocumentCreate] = []
        async for file in file_ingestor.ingest_async(dir_path):
            _file_content = file["content"].strip()
            _file_path = str(file["path"])

            if not _file_content:
                logger.warning(f"Skipping {_file_path} due to empty content")
                continue

            relative_path = Path(_file_path).relative_to(dir_path).as_posix()
            batch.append(
                DocumentCreate(
                    content=_file_content,
                    tags=[f"path:{relative_path}"],
                )
            )

            # Flush a full batch to bound memory during large ingests.
            if len(batch) >= batch_size:
                await lnclite.documents.batch_create(batch)
                logger.info(f"Created {len(batch)} documents")
                batch = []

        # Flush the final partial batch.
        if batch:
            await lnclite.documents.batch_create(batch)
            logger.info(f"Created {len(batch)} documents")

        await lnclite.documents.create_index()

        return lnclite

    @classmethod
    async def load(
        cls,
        lancedb_path: Path | str,
        *,
        manifest_table: Text = DEFAULT_MANIFEST_TABLE,
        document_table: Text = DEFAULT_DOCUMENT_TABLE,
        openai_embeddings_model: "AsyncOpenAIEmbeddingsModel",
        model_settings: "ModelSettings",
        vector_search_prefer: VectorIndexPreference = "balanced",
        refresh_index: bool = False,
        verbose: bool = False,
    ) -> "Lnclite":
        """Open an existing database and validate it against the given model.

        Raises LncliteNotFoundError when the manifest is missing, and
        ValueError when the stored model name or dimensions disagree with
        the supplied embeddings model / settings.
        """
        lancedb_path = Path(lancedb_path)
        if not lancedb_path.is_dir():
            raise FileNotFoundError(f"Lancedb path {lancedb_path} not found")

        if model_settings.dimensions is None:
            model_settings.dimensions = await get_default_dimensions(
                openai_embeddings_model
            )

        lnclite = cls(
            lancedb_path=lancedb_path,
            manifest_table=manifest_table,
            document_table=document_table,
            openai_embeddings_model=openai_embeddings_model,
            model_settings=model_settings,
            vector_search_prefer=vector_search_prefer,
            verbose=verbose,
        )

        # Validate manifest
        manifest = await lnclite.manifest.get()
        if manifest is None:
            raise LncliteNotFoundError("Manifest not found")
        if manifest.model != openai_embeddings_model.model:
            raise ValueError(
                f"OpenAI embeddings model mismatch: {manifest.model} != {openai_embeddings_model.model}"  # noqa: E501
            )
        if manifest.dimensions != model_settings.dimensions:
            raise ValueError(
                f"Model settings dimensions mismatch: {manifest.dimensions} != {model_settings.dimensions}"  # noqa: E501
            )

        if refresh_index:
            await lnclite.documents.create_index()

        return lnclite

    async def get_connection(self) -> lancedb.AsyncConnection:
        """Return the LanceDB connection, opening it on first use."""
        if self._connection is None:
            self._connection = await lancedb.connect_async(self.lancedb_path)
            logger.info(f"Lancedb connected to {self.lancedb_path}")
        return self._connection

    @functools.cached_property
    def manifest(self) -> "Manifest":
        """Accessor for the manifest table (created once per client)."""
        return Manifest(self)

    @functools.cached_property
    def documents(self) -> "Documents":
        """Accessor for the documents table (created once per client)."""
        return Documents(self)

    async def create_index(self) -> None:
        """Convenience alias for ``documents.create_index()``."""
        await self.documents.create_index()

    async def embed(self, texts: List[Text]) -> np.ndarray:
        """Embed *texts* and L2-normalize so dot product equals cosine."""
        emb_res = await self.openai_embeddings_model.get_embeddings(
            texts, model_settings=self.model_settings
        )
        return normalize(emb_res.to_numpy())  # (n, d)

    async def search(
        self,
        query: Text,
        *,
        tags_any: Optional[List[Text]] = None,
        tags_all: Optional[List[Text]] = None,
        limit: int = 5,
        verbose: bool = False,
    ) -> "SearchResults":
        """Semantic search over documents, optionally filtered by tags.

        *tags_any* matches documents sharing at least one tag; *tags_all*
        requires every tag. Distances are dot products over normalized
        vectors. Set *verbose* (or the client-wide flag) to log the query plan.
        """
        document_table = await self.documents.get_table()

        # Single-query embedding; take row 0 of the (1, d) matrix.
        query_vector = (await self.embed([query]))[0]

        search_query = await document_table.search(query_vector)
        tags_filter = _tags_filter(tags_any=tags_any, tags_all=tags_all)
        if tags_filter is not None:
            search_query = search_query.where(tags_filter)

        if verbose or self.verbose:
            logger.info(f"Query plan: {await search_query.explain_plan()}")

        search_results: List[Dict] = (
            await search_query.distance_type("dot").limit(limit).to_list()
        )

        results: List[SearchResult] = []
        for result in search_results:
            _doc = Document.model_validate(result)
            _distance = result["_distance"]
            results.append(SearchResult(document=_doc, distance=_distance))

        return SearchResults(results=results)
|
|
557
|
+
|
|
558
|
+
|
|
559
|
+
class Manifest:
    """Accessor for the single-row manifest table of a Lnclite database."""

    def __init__(self, client: "Lnclite"):
        self.client = client
        # Opened/created lazily; see get_table().
        self._table: lancedb.AsyncTable | None = None

    async def get_table(self) -> lancedb.AsyncTable:
        """Open the manifest table, creating it with the schema if absent."""
        if self._table is not None:
            return self._table

        conn = await self.client.get_connection()
        if await _table_exists(conn, self.client.manifest_table):
            self._table = await conn.open_table(self.client.manifest_table)
        else:
            self._table = await conn.create_table(
                self.client.manifest_table, schema=self.client._manifest_lancedb_model
            )
        return self._table

    async def get(self) -> ManifestModel | None:
        """Return the manifest row, or None when the table is empty."""
        manifest_table = await self.get_table()
        _query_builder = manifest_table.query()
        manifests = await _query_builder.limit(1).to_pydantic(
            self.client._manifest_lancedb_model
        )
        if len(manifests) > 0:
            # Round-trip through JSON to convert the LanceDB model into the
            # plain ManifestModel exposed to callers.
            return ManifestModel.model_validate_json(manifests[0].model_dump_json())
        return None

    async def retrieve(self) -> ManifestModel:
        """Return the manifest row; raise LncliteNotFoundError when missing."""
        might_manifest = await self.get()
        if might_manifest is not None:
            return might_manifest
        raise LncliteNotFoundError("Manifest not found")

    async def upsert(
        self,
        *,
        name: Text,
        description: Text,
        model: Text,
        dimensions: int,
    ) -> ManifestModel:
        """Create the manifest row, or update it in place when one exists."""
        table = await self.get_table()
        might_manifest = await self.get()

        if might_manifest is None:
            manifest = self.client._manifest_lancedb_model(
                name=name, description=description, model=model, dimensions=dimensions
            )
            await table.add([manifest])

        else:
            manifest = might_manifest
            manifest.name = name
            manifest.description = description
            manifest.model = model
            manifest.dimensions = dimensions
            manifest.last_updated = int(time.time())
            # NOTE(review): values= receives the pydantic model itself —
            # confirm the LanceDB update API accepts a model here (a plain
            # dict of column -> value may be expected).
            await table.update(where=f"id = {manifest.id}", values=manifest)

        return ManifestModel.model_validate_json(manifest.model_dump_json())
|
|
620
|
+
|
|
621
|
+
|
|
622
|
+
class Documents:
    """CRUD and paginated listing for document rows in the LanceDB document table.

    Lazily opens (or creates) the backing table on first use and caches the
    handle on the instance.
    """

    def __init__(self, client: "Lnclite"):
        self.client = client
        # Cached table handle; populated lazily by get_table().
        self._table: lancedb.AsyncTable | None = None

    async def get_table(self) -> lancedb.AsyncTable:
        """Open (or create on first use) and cache the documents table."""
        if self._table is not None:
            return self._table

        conn = await self.client.get_connection()
        if await _table_exists(conn, self.client.document_table):
            self._table = await conn.open_table(self.client.document_table)
        else:
            self._table = await conn.create_table(
                self.client.document_table, schema=self.client._document_lancedb_model
            )
        return self._table

    async def create_index(self) -> None:
        """Create the tag (label-list) index and, when worthwhile, a vector index.

        The vector index is skipped for small tables, where brute-force
        search is already exact and fast.
        """
        table = await self.get_table()

        await table.create_index("tags", config=lancedb.index.LabelList())

        row_count = await self.count()

        vs_config = recommended_vector_index_config(
            row_count,
            self.client.model_settings.dimensions,
            prefer=self.client.vector_search_prefer,
        )
        if vs_config is None:
            logger.info(
                "Skipping vector index: row_count=%s is too small; brute-force search is exact and fast",  # noqa: E501
                row_count,
            )
        else:
            await table.create_index("vector", config=vs_config)
            logger.info(f"Created vector index with config: {vs_config}")

    async def count(self) -> int:
        """Return the total number of document rows."""
        document_table = await self.get_table()
        return await document_table.count_rows()

    async def get(self, id: int) -> Document | None:
        """Fetch a single document by id, or None if it does not exist."""
        document_table = await self.get_table()
        documents = await (
            document_table.query()
            .where(f"id = {id}")
            .limit(1)
            .to_pydantic(self.client._document_lancedb_model)
        )
        if len(documents) > 0:
            # JSON round-trip converts the LanceDB row model into the
            # public Document model.
            return Document.model_validate_json(documents[0].model_dump_json())
        return None

    async def retrieve(self, id: int) -> Document:
        """Fetch a document by id, raising LncliteNotFoundError when absent."""
        might_document = await self.get(id)
        if might_document is not None:
            return might_document
        raise LncliteNotFoundError(f"Document with id {id} not found")

    async def create(self, document_create: DocumentCreate) -> Document:
        """Embed and insert a single document; returns the stored Document."""
        return (await self.batch_create([document_create]))[0]

    async def batch_create(
        self, document_creates: List[DocumentCreate]
    ) -> List[Document]:
        """Embed and insert several documents in one table write.

        All contents are embedded in a single client.embed() call, then all
        rows are added together. Returns the created documents with their
        normalized vectors attached.
        """
        document_table = await self.get_table()

        normalized_vectors = await self.client.embed(
            [d.content for d in document_creates]
        )

        documents = [
            self.client._document_lancedb_model(
                content=d.content, tags=d.tags, vector=v
            )
            for d, v in zip(document_creates, normalized_vectors)
        ]

        await document_table.add(documents)

        output: List[Document] = []
        for document, v in zip(documents, normalized_vectors):
            _doc = Document.model_validate_json(
                document.model_dump_json(exclude_none=True)
            )
            # Re-attach the vector explicitly after the JSON round-trip.
            # .tolist() assumes v is a numpy-like array — TODO confirm
            # against the return type of client.embed().
            _doc.vector = v.tolist()
            output.append(_doc)

        return output

    async def list(
        self,
        *,
        tags_any: Optional[List[Text]] = None,
        tags_all: Optional[List[Text]] = None,
        limit: int = 10,
        order: ListOrder = "asc",
        next_page_token: Optional[Text] = None,
        verbose: bool = False,
    ) -> TokenPaginatic[Document]:
        """List documents with tag filtering and signed-token keyset pagination.

        Args:
            tags_any: Match documents having at least one of these tags.
            tags_all: Match documents having all of these tags.
            limit: Page size; must be >= 1.
            order: "asc"/1 or "desc"/-1, ordering by document id.
            next_page_token: Opaque signed cursor from a previous page.
            verbose: Log the query plan before executing.

        Raises:
            ValueError: If limit < 1.
        """
        if limit < 1:
            raise ValueError(f"Limit must be greater than 0, got {limit}")

        sql_order = _to_sql_order(order)
        # Keyset pagination: ascending pages advance with id >, descending with id <.
        id_operator = ">" if sql_order == "ASC" else "<"
        after_id: Optional[int] = None
        if next_page_token is not None:
            # Token is signed with the client secret; tampering fails verification.
            decoded_token = decode_and_verify(next_page_token, self.client._secret_key)
            after_id = decoded_token.get("after")

        document_table = await self.client.documents.get_table()

        # Prepare query
        query_builder = document_table.query()

        query_builder = query_builder.where(
            _documents_list_where_clause(
                id_operator=id_operator,
                sql_order=sql_order,
                after_id=after_id,
                tags_any=tags_any,
                tags_all=tags_all,
            )
        ).limit(limit + 1)  # fetch one extra row to detect whether more pages exist
        if verbose or self.client.verbose:
            logger.info(f"Query plan: {await query_builder.explain_plan()}")

        # Execute query
        documents = await query_builder.to_pydantic(self.client._document_lancedb_model)

        has_more = len(documents) > limit
        documents = documents[:limit]

        _next_token = (
            encode_and_sign({"after": documents[-1].id}, self.client._secret_key)
            if has_more
            else None
        )

        return TokenPaginatic(
            object="list",
            data=[Document.model_validate_json(d.model_dump_json()) for d in documents],
            next_page_token=_next_token,
        )
|
|
768
|
+
|
|
769
|
+
|
|
770
|
+
class SearchResult(BaseModel):
    """A single vector-search hit: the matched document plus its distance."""

    document: Document
    # Distance between the query vector and the document vector
    # (lower means closer; the exact metric depends on the search setup).
    distance: float
|
|
773
|
+
|
|
774
|
+
|
|
775
|
+
class SearchResults(BaseModel):
    """Container for an ordered list of vector-search results."""

    results: List[SearchResult]
|
|
777
|
+
|
|
778
|
+
|
|
779
|
+
class LncliteNotFoundError(Exception):
    """Raised when a requested manifest or document does not exist."""
|
|
781
|
+
|
|
782
|
+
|
|
783
|
+
def _documents_list_where_clause(
    *,
    id_operator: Literal[">", "<"],
    sql_order: SqlOrder,
    after_id: Optional[int] = None,
    tags_any: Optional[List[Text]] = None,
    tags_all: Optional[List[Text]] = None,
) -> str:
    """Build the SQL predicate (plus ORDER BY) used by Documents.list.

    The id clause implements keyset pagination; "id > 0" matches every row
    when no cursor is supplied.
    """
    if after_id is None:
        clauses = ["id > 0"]
    else:
        clauses = [f"id {id_operator} {after_id}"]

    tag_clause = _tags_filter(tags_any=tags_any, tags_all=tags_all)
    if tag_clause:
        clauses.append(tag_clause)

    predicate = " AND ".join(f"({clause})" for clause in clauses)
    return f"{predicate} ORDER BY id {sql_order}"
|
|
799
|
+
|
|
800
|
+
|
|
801
|
+
def _tags_filter(
    *,
    tags_any: Optional[List[Text]] = None,
    tags_all: Optional[List[Text]] = None,
) -> Optional[str]:
    """Combine any/all tag predicates with AND; None when no tags are given."""
    parts: List[str] = []
    if tags_any:
        parts.append(tag_filter_any(tags_any))
    if tags_all:
        parts.append(tag_filter_all(tags_all))
    return " AND ".join(f"({part})" for part in parts) if parts else None
|
|
814
|
+
|
|
815
|
+
|
|
816
|
+
def _to_sql_order(order: ListOrder) -> SqlOrder:
|
|
817
|
+
if order in ("asc", 1):
|
|
818
|
+
return "ASC"
|
|
819
|
+
if order in ("desc", -1):
|
|
820
|
+
return "DESC"
|
|
821
|
+
raise ValueError(f"Invalid order: {order}")
|
|
822
|
+
|
|
823
|
+
|
|
824
|
+
async def _table_exists(conn: lancedb.AsyncConnection, table_name: Text) -> bool:
    """Report whether *table_name* already exists on the given connection."""
    existing = await conn.list_tables()
    return table_name in existing.tables
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""Directory crawling and text extraction from readable files.
|
|
2
|
+
|
|
3
|
+
Provides FilesIngestor for scanning trees while skipping binary and hidden paths.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import asyncio
|
|
7
|
+
import logging
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import (
|
|
10
|
+
AsyncGenerator,
|
|
11
|
+
Awaitable,
|
|
12
|
+
Callable,
|
|
13
|
+
Dict,
|
|
14
|
+
Generator,
|
|
15
|
+
TypeAlias,
|
|
16
|
+
TypedDict,
|
|
17
|
+
cast,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
# Module-level logger for ingestion diagnostics (skipped files, read errors).
logger: logging.Logger = logging.getLogger(__name__)

# Number of leading bytes sampled when probing a file for binary content.
DEFAULT_BINARY_PROBE_CHUNK_SIZE: int = 1024

# A per-extension reader: takes the file path and returns its extracted text.
# May be a plain function or an async coroutine function.
FileReader: TypeAlias = Callable[[Path], str] | Callable[[Path], Awaitable[str]]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class FileIngestorResult(TypedDict):
    """One ingested file: its source path and extracted text content."""

    # Path of the source file, stringified from the crawled Path.
    path: str
    # Extracted text content of the file.
    content: str
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class FileIngestor:
    """
    A utility class to crawl directories and extract text content
    from readable files while filtering out binary and hidden data.
    """

    def __init__(self) -> None:
        # Maps lowercased file extensions (e.g. ".pdf") to parsing functions.
        self._custom_readers: Dict[str, FileReader] = {}

    def register_reader(self, extension: str, reader_func: FileReader) -> None:
        """Registers a handler for a specific file extension (e.g., '.pdf')."""
        self._custom_readers[extension.lower()] = reader_func

    def ingest(self, dir_path: str) -> Generator[FileIngestorResult, None, None]:
        """
        Iterates through the directory and yields documents as they are processed.

        Async custom readers cannot run here; such files are skipped with a
        warning — use ingest_async for those.

        Yields:
            Dict containing 'path' and 'content' keys.
        """
        root = Path(dir_path)

        for file_path in root.rglob("*"):
            # Ensure it is a file and not an excluded (hidden/underscore) path
            if not file_path.is_file() or self._is_excluded(file_path):
                continue

            extension = file_path.suffix.lower()

            try:
                # 1. Use specialized reader if registered
                if extension in self._custom_readers:
                    reader = self._custom_readers[extension]
                    if asyncio.iscoroutinefunction(reader):
                        logger.warning(
                            "Skipping %s: async reader requires ingest_async.",
                            file_path,
                        )
                        continue
                    sync_reader = cast(Callable[[Path], str], reader)
                    content = sync_reader(file_path)
                    yield FileIngestorResult(path=str(file_path), content=content)

                # 2. Fallback to binary probe for generic text files
                elif not self._is_binary(file_path):
                    content = self._read_text(file_path)
                    yield FileIngestorResult(path=str(file_path), content=content)

            except Exception as e:
                # Best-effort crawl: one failing file must not abort the walk.
                logger.warning("Skipping %s due to error: %s", file_path, e)

    async def ingest_async(
        self, dir_path: str
    ) -> AsyncGenerator[FileIngestorResult, None]:
        """Async variant of ingest: walks the tree sync, reads in worker threads."""
        root = Path(dir_path)

        for file_path in root.rglob("*"):
            if not file_path.is_file() or self._is_excluded(file_path):
                continue

            extension = file_path.suffix.lower()

            try:
                if extension in self._custom_readers:
                    reader = self._custom_readers[extension]
                    if asyncio.iscoroutinefunction(reader):
                        content = await reader(file_path)
                    else:
                        # Run sync custom readers in a worker thread too, so
                        # a slow reader does not block the event loop.
                        sync_reader = cast(Callable[[Path], str], reader)
                        content = await asyncio.to_thread(sync_reader, file_path)
                    yield FileIngestorResult(path=str(file_path), content=content)
                elif not await asyncio.to_thread(self._is_binary, file_path):
                    content = await asyncio.to_thread(self._read_text, file_path)
                    yield FileIngestorResult(path=str(file_path), content=content)
            except Exception as e:
                logger.warning("Skipping %s due to error: %s", file_path, e)

    def _is_binary(
        self, file_path: Path, chunk_size: int = DEFAULT_BINARY_PROBE_CHUNK_SIZE
    ) -> bool:
        """Determines if a file is binary using null-byte detection and UTF-8 probing."""  # noqa: E501
        try:
            with open(file_path, "rb") as f:
                chunk = f.read(chunk_size)
            # Null bytes are standard in binary formats
            if b"\0" in chunk:
                return True
            # Attempt decoding to verify it's a valid text format.
            # NOTE: a multi-byte character split at the chunk boundary can
            # misclassify a text file as binary.
            chunk.decode("utf-8")
            return False
        except Exception:
            # Any read or decode failure marks the file as binary; the
            # original `(UnicodeDecodeError, Exception)` tuple was redundant
            # since UnicodeDecodeError is already an Exception subclass.
            return True

    def _is_excluded(self, path: Path) -> bool:
        """Checks if any part of the file path starts with '.' or '_'."""
        return any(part.startswith((".", "_")) for part in path.parts)

    def _read_text(self, file_path: Path) -> str:
        """Reads plain text files with encoding safety (undecodable bytes dropped)."""
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
|
|
File without changes
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Index parameter helpers for LanceDB vector indexes."""
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def calculate_index_params(n_rows: int, dimension: int) -> tuple[int, int]:
    """Derive (num_partitions, num_sub_vectors) for a vector index.

    Partitions scale with sqrt(n_rows) * 8, capped so each partition holds at
    least ~256 rows, and never drop below 1. Sub-vectors are the largest
    divisor of the dimension not exceeding dimension // 8, defaulting to 1.
    """
    # Cap partitions so each holds at least ~256 rows; floor at 1.
    partition_cap = n_rows // 256
    num_partitions = max(1, min(int(math.sqrt(n_rows) * 8), partition_cap))

    # Largest divisor of `dimension` that is <= dimension // 8 (fallback: 1).
    num_sub_vectors = next(
        (candidate for candidate in range(dimension // 8, 0, -1)
         if dimension % candidate == 0),
        1,
    )

    return num_partitions, num_sub_vectors
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import xxhash
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def get_folder_fingerprint(target_path: Path | str, read_content: bool = False) -> str:
    """Compute an xxh64 hex fingerprint of a directory tree.

    Mixes each file's relative path plus either its metadata (mtime + size,
    the fast default) or its full content (read_content=True) into a single
    rolling hash. Files that vanish or are unreadable are skipped.

    Args:
        target_path: Root directory to fingerprint.
        read_content: Hash file bytes instead of metadata — stronger but
            pays full I/O cost.

    Returns:
        Hex digest string; identical for identical trees, different on any
        rename, move, or (metadata/content) change.
    """
    # Build a single rolling hash with xxh64.
    folder_hash = xxhash.xxh64()

    # Keep traversal order stable; otherwise the same tree can hash
    # differently if os.walk yields entries in a different order.
    for root, dirs, files in os.walk(target_path):
        # BUGFIX: os.walk visits subdirectories in arbitrary, OS-dependent
        # order. Sorting `dirs` in place makes the walk — and therefore the
        # fingerprint — deterministic across runs and platforms.
        dirs.sort()
        for name in sorted(files):
            file_path = os.path.join(root, name)

            try:
                # 1. Mix in relative path (detects moves/renames).
                rel_path = os.path.relpath(file_path, target_path)
                folder_hash.update(rel_path.encode())

                # 2. Mix in file metadata — fast default. Content hashing
                # catches rare changes where mtime/size do not move, at the
                # cost of reading every byte.
                if read_content:
                    with open(file_path, "rb") as f:
                        folder_hash.update(f.read())
                else:
                    stat = os.stat(file_path)
                    folder_hash.update(str(stat.st_mtime).encode())  # modification time
                    folder_hash.update(str(stat.st_size).encode())  # file size

            except (PermissionError, FileNotFoundError):
                continue

    return folder_hash.hexdigest()
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import os
|
|
3
|
+
import socket
|
|
4
|
+
import threading
|
|
5
|
+
import time
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
# Process-wide Snowflake instance, created lazily by generate_id().
_global_generator: Optional["Snowflake"] = None
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Snowflake:
    """Thread-safe snowflake id generator.

    Ids pack, from high to low bits: milliseconds since a custom epoch,
    a 10-bit worker id, and a 12-bit per-millisecond sequence, so ids are
    unique per worker and roughly time-ordered.
    """

    def __init__(self, worker_id: int):
        self.twepoch = 1704067200000  # custom epoch: 2024-01-01T00:00:00Z, in ms

        # Define the bit lengths of each part
        self.worker_id_bits = 10
        self.sequence_bits = 12

        # Calculate the maximum value
        self.max_worker_id = -1 ^ (-1 << self.worker_id_bits)  # 1023

        # Shift amounts
        self.worker_id_shift = self.sequence_bits
        self.timestamp_left_shift = self.sequence_bits + self.worker_id_bits
        self.sequence_mask = -1 ^ (-1 << self.sequence_bits)  # 4095

        # BUGFIX: max_worker_id was computed but never enforced. A worker id
        # wider than 10 bits would overflow its field and corrupt the
        # timestamp bits of every generated id.
        if not 0 <= worker_id <= self.max_worker_id:
            raise ValueError(
                f"worker_id must be in [0, {self.max_worker_id}], got {worker_id}"
            )

        self.worker_id = worker_id
        self.sequence = 0
        self.last_timestamp = -1

        self.lock = threading.Lock()

    def _get_timestamp(self) -> int:
        """Current wall-clock time in whole milliseconds."""
        return int(time.time() * 1000)

    def generate(self) -> int:
        """Produce the next unique, time-ordered id (thread-safe).

        Raises:
            Exception: If the wall clock moved backwards, since generating
                now could duplicate previously issued ids.
        """
        with self.lock:
            timestamp = self._get_timestamp()

            if timestamp < self.last_timestamp:
                raise Exception("Clock backward exception")

            if timestamp == self.last_timestamp:
                # Within the same millisecond, the sequence number increases
                self.sequence = (self.sequence + 1) & self.sequence_mask
                if self.sequence == 0:
                    # Sequence exhausted for this millisecond: spin-wait
                    # until the clock advances.
                    while timestamp <= self.last_timestamp:
                        timestamp = self._get_timestamp()
            else:
                self.sequence = 0

            self.last_timestamp = timestamp

            # Combine the parts: timestamp | worker id | sequence
            new_id = (
                ((timestamp - self.twepoch) << self.timestamp_left_shift)
                | (self.worker_id << self.worker_id_shift)
                | self.sequence
            )

            return new_id
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def get_valid_worker_id(max_bits: int = 10) -> int:
    """Resolve a worker id in [0, 2**max_bits - 1] from WORKER_ID or the hostname.

    Numeric WORKER_ID values are folded into range with modulo; non-numeric
    values — and the hostname fallback when the variable is unset — are
    hashed with MD5 first, then folded the same way.
    """
    id_space = 1 << max_bits  # 1024 by default
    raw = os.getenv("WORKER_ID")

    # Case 1 and Case 2: environment variable carries a value.
    if raw:
        try:
            # Case 1: standard numeric input — modulo forces it into range.
            return abs(int(raw)) % id_space
        except ValueError:
            # Case 2: non-numeric string (e.g. "app-worker-a") — hash it to
            # a large integer, then fold into range.
            digest = hashlib.md5(raw.encode("utf-8")).hexdigest()
            return int(digest, 16) % id_space

    # Case 3: no WORKER_ID set — fall back to the machine hostname as the
    # source of uniqueness.
    digest = hashlib.md5(socket.gethostname().encode("utf-8")).hexdigest()
    return int(digest, 16) % id_space
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def generate_id() -> int:
    """Generate a new snowflake id from a process-wide, lazily created generator.

    The first call builds the global Snowflake with a worker id resolved by
    get_valid_worker_id(); subsequent calls reuse the same instance.
    """
    global _global_generator
    if _global_generator is None:
        _global_generator = Snowflake(worker_id=get_valid_worker_id())
    return _global_generator.generate()
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
authors = [{ name = "Allen Chou", email = "f1470891079@gmail.com" }]
|
|
3
|
+
dependencies = [
|
|
4
|
+
"lancedb",
|
|
5
|
+
"openai",
|
|
6
|
+
"openai-embeddings-model",
|
|
7
|
+
"paginatic",
|
|
8
|
+
"pydantic (>=2)",
|
|
9
|
+
"xxhash",
|
|
10
|
+
]
|
|
11
|
+
description = "Lite usages of lancedb."
|
|
12
|
+
license = "MIT"
|
|
13
|
+
license-files = ["LICENSE"]
|
|
14
|
+
name = "lnclite"
|
|
15
|
+
readme = "README.md"
|
|
16
|
+
requires-python = ">=3.11,<4"
|
|
17
|
+
version = "0.1.0"
|
|
18
|
+
|
|
19
|
+
[project.urls]
|
|
20
|
+
Homepage = "https://github.com/allen2c/lnclite"
|
|
21
|
+
"PyPI" = "https://pypi.org/project/lnclite/"
|
|
22
|
+
Repository = "https://github.com/allen2c/lnclite"
|
|
23
|
+
|
|
24
|
+
[tool.poetry]
|
|
25
|
+
packages = [{ include = "lnclite" }]
|
|
26
|
+
|
|
27
|
+
[tool.poetry.group.dev.dependencies]
|
|
28
|
+
black = { extras = ["jupyter"], version = "*" }
|
|
29
|
+
isort = "*"
|
|
30
|
+
lines-of-work = { git = "https://github.com/allen2c/lines-of-work.git" }
|
|
31
|
+
logging-bullet-train = ">=0.4.0"
|
|
32
|
+
mkdocs-material = "*"
|
|
33
|
+
poetry-plugin-export = "*"
|
|
34
|
+
pytest = "*"
|
|
35
|
+
pytest-asyncio = "*"
|
|
36
|
+
pytest-cov = "*"
|
|
37
|
+
pytest-env = "*"
|
|
38
|
+
pytest-xdist = "*"
|
|
39
|
+
rich = "*"
|
|
40
|
+
rich-color-support = "*"
|
|
41
|
+
setuptools = "*"
|
|
42
|
+
twine = "*"
|
|
43
|
+
|
|
44
|
+
[tool.isort]
|
|
45
|
+
profile = "black"
|
|
46
|
+
|
|
47
|
+
[tool.black]
|
|
48
|
+
target-version = ["py311"]
|
|
49
|
+
|
|
50
|
+
[tool.flake8]
|
|
51
|
+
ignore = ["E203", "E704", "W503"]
|
|
52
|
+
max-line-length = 88
|
|
53
|
+
|
|
54
|
+
[tool.pytest.ini_options]
|
|
55
|
+
env = ["ENVIRONMENT=test", "PYTEST_IS_RUNNING=true"]
|
|
56
|
+
|
|
57
|
+
[build-system]
|
|
58
|
+
build-backend = "poetry.core.masonry.api"
|
|
59
|
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|