lnclite 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lnclite-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 AllenChou
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
lnclite-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,101 @@
1
+ Metadata-Version: 2.4
2
+ Name: lnclite
3
+ Version: 0.1.0
4
+ Summary: Lite usages of lancedb.
5
+ License-Expression: MIT
6
+ License-File: LICENSE
7
+ Author: Allen Chou
8
+ Author-email: f1470891079@gmail.com
9
+ Requires-Python: >=3.11,<4
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Classifier: Programming Language :: Python :: 3.14
15
+ Requires-Dist: lancedb
16
+ Requires-Dist: openai
17
+ Requires-Dist: openai-embeddings-model
18
+ Requires-Dist: paginatic
19
+ Requires-Dist: pydantic (>=2)
20
+ Requires-Dist: xxhash
21
+ Project-URL: Homepage, https://github.com/allen2c/lnclite
22
+ Project-URL: PyPI, https://pypi.org/project/lnclite/
23
+ Project-URL: Repository, https://github.com/allen2c/lnclite
24
+ Description-Content-Type: text/markdown
25
+
26
+ # lnclite
27
+
28
+ `lnclite` is a small async LanceDB document store for OpenAI-compatible embeddings. It gives you a compact API for creating a local vector database, adding documents, filtering by tags, and running semantic search.
29
+
30
+ ## Installation
31
+
32
+ ```bash
33
+ pip install lnclite
34
+ ```
35
+
36
+ For local development from this repository:
37
+
38
+ ```bash
39
+ poetry install --all-groups
40
+ ```
41
+
42
+ ## Quick Start
43
+
44
+ ```python
45
+ import asyncio
46
+
47
+ from openai import AsyncOpenAI
48
+ from openai_embeddings_model import ModelSettings
49
+
50
+ from lnclite import DocumentCreate, Lnclite, get_openai_embeddings_model
51
+
52
+
53
+ async def main():
54
+ embeddings = get_openai_embeddings_model(
55
+ openai_client=AsyncOpenAI(),
56
+ )
57
+
58
+ client = await Lnclite.new(
59
+ lancedb_path="outputs/demo.lance",
60
+ openai_embeddings_model=embeddings,
61
+ model_settings=ModelSettings(dimensions=1536),
62
+ )
63
+
64
+ await client.documents.batch_create(
65
+ [
66
+ DocumentCreate(
67
+ content="A note about async Python clients.",
68
+ tags=["type:note", "topic:python"],
69
+ ),
70
+ DocumentCreate(
71
+ content="A note about vector search and indexing.",
72
+ tags=["type:note", "topic:search"],
73
+ ),
74
+ ]
75
+ )
76
+
77
+ await client.create_index()
78
+
79
+ results = await client.search(
80
+ "How should I design vector search?",
81
+ tags_any=["topic:search"],
82
+ )
83
+
84
+ for result in results.results:
85
+ print(result.document.content)
86
+ print(result.document.tags)
87
+ print(result.distance)
88
+
89
+
90
+ if __name__ == "__main__":
91
+ asyncio.run(main())
92
+ ```
93
+
94
+ ## Documentation
95
+
96
+ Full documentation is published with MkDocs Material from this repository's `docs/` directory.
97
+
98
+ ## License
99
+
100
+ MIT
101
+
@@ -0,0 +1,75 @@
1
+ # lnclite
2
+
3
+ `lnclite` is a small async LanceDB document store for OpenAI-compatible embeddings. It gives you a compact API for creating a local vector database, adding documents, filtering by tags, and running semantic search.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install lnclite
9
+ ```
10
+
11
+ For local development from this repository:
12
+
13
+ ```bash
14
+ poetry install --all-groups
15
+ ```
16
+
17
+ ## Quick Start
18
+
19
+ ```python
20
+ import asyncio
21
+
22
+ from openai import AsyncOpenAI
23
+ from openai_embeddings_model import ModelSettings
24
+
25
+ from lnclite import DocumentCreate, Lnclite, get_openai_embeddings_model
26
+
27
+
28
+ async def main():
29
+ embeddings = get_openai_embeddings_model(
30
+ openai_client=AsyncOpenAI(),
31
+ )
32
+
33
+ client = await Lnclite.new(
34
+ lancedb_path="outputs/demo.lance",
35
+ openai_embeddings_model=embeddings,
36
+ model_settings=ModelSettings(dimensions=1536),
37
+ )
38
+
39
+ await client.documents.batch_create(
40
+ [
41
+ DocumentCreate(
42
+ content="A note about async Python clients.",
43
+ tags=["type:note", "topic:python"],
44
+ ),
45
+ DocumentCreate(
46
+ content="A note about vector search and indexing.",
47
+ tags=["type:note", "topic:search"],
48
+ ),
49
+ ]
50
+ )
51
+
52
+ await client.create_index()
53
+
54
+ results = await client.search(
55
+ "How should I design vector search?",
56
+ tags_any=["topic:search"],
57
+ )
58
+
59
+ for result in results.results:
60
+ print(result.document.content)
61
+ print(result.document.tags)
62
+ print(result.distance)
63
+
64
+
65
+ if __name__ == "__main__":
66
+ asyncio.run(main())
67
+ ```
68
+
69
+ ## Documentation
70
+
71
+ Full documentation is published with MkDocs Material from this repository's `docs/` directory.
72
+
73
+ ## License
74
+
75
+ MIT
@@ -0,0 +1,826 @@
1
+ """Small async LanceDB document store with OpenAI embeddings."""
2
+
3
+ import functools
4
+ import hashlib
5
+ import logging
6
+ import time
7
+ from pathlib import Path
8
+ from typing import (
9
+ TYPE_CHECKING,
10
+ Dict,
11
+ Final,
12
+ List,
13
+ Literal,
14
+ Optional,
15
+ Text,
16
+ Type,
17
+ )
18
+
19
+ import diskcache
20
+ import lancedb
21
+ import lancedb.index
22
+ import numpy as np
23
+ import tiktoken
24
+ from lancedb.pydantic import LanceModel, Vector
25
+ from openai import AsyncOpenAI
26
+ from openai_embeddings_model import MAX_BATCH_SIZE as DEFAULT_EMBEDDINGS_MAX_BATCH_SIZE
27
+ from openai_embeddings_model import (
28
+ AsyncOpenAIEmbeddingsModel,
29
+ ModelSettings,
30
+ )
31
+ from openai_embeddings_model.normalize import normalize
32
+ from paginatic import TokenPaginatic
33
+ from paginatic.helpers import decode_and_verify, encode_and_sign
34
+ from pydantic import BaseModel, Field, model_validator
35
+
36
+ if TYPE_CHECKING:
37
+ from lnclite.file_ingestor import FileReader
38
+
39
# Package version; keep in sync with pyproject.toml.
__version__: Final[Text] = "0.1.0"
# Names re-exported as the public API of this package.
__all__: Final[List[Text]] = [
    "Document",
    "DocumentCreate",
    "Lnclite",
    "LncliteNotFoundError",
    "ManifestModel",
    "SearchResult",
    "SearchResults",
    "get_model_settings",
    "get_openai_embeddings_model",
]

logger = logging.getLogger(__name__)

# Default LanceDB table names and embedding configuration.
DEFAULT_MANIFEST_TABLE = "manifest"
DEFAULT_DOCUMENT_TABLE = "documents"
DEFAULT_OPENAI_MODEL = "text-embedding-3-small"
DEFAULT_MAX_INPUT_TOKENS = 4096
DEFAULT_DIMENSIONS = 1536

# Trade-off knob consumed by recommended_vector_index_config.
VectorIndexPreference = Literal["storage", "balanced", "accuracy", "latency"]
# User-facing ordering values and their SQL keyword counterparts.
ListOrder = Literal["asc", "desc", 1, -1]
SqlOrder = Literal["ASC", "DESC"]
63
+
64
+
65
def gen_id() -> str:
    """Generate a unique id using the package's snowflake generator.

    NOTE(review): annotated as returning ``str``, but the Lance models
    declare ``id: int`` — confirm what ``generate_id`` actually returns.
    """
    # Imported lazily to avoid a module-level import cycle.
    from lnclite.utils.snowflake import generate_id

    return generate_id()
69
+
70
+
71
@functools.cache
def get_document_lancedb_model(dim: int) -> Type[LanceModel]:
    """Build (and cache) the document table schema for *dim*-sized vectors.

    Cached per dimension so repeated calls return the same class object,
    which LanceDB uses as the table schema.
    """

    class DocumentLancedbModel(LanceModel):
        # NOTE(review): gen_id is annotated -> str while id is int — verify.
        id: int = Field(default_factory=gen_id)
        content: Text = Field(description="The content of the document.")  # noqa: E501
        md5: Text = ""  # always recomputed in validate_values
        vector: Vector(dim)
        tags: List[Text] = Field(default_factory=list)

        @model_validator(mode="after")
        def validate_values(self) -> "DocumentLancedbModel":
            # Normalize the content and refuse empty documents early.
            self.content = self.content.strip()
            if not self.content:
                raise ValueError("Content cannot be empty")
            # md5 is derived from the stripped content for dedup/change checks.
            self.md5 = hashlib.md5(self.content.encode()).hexdigest()
            return self

    return DocumentLancedbModel
89
+
90
+
91
@functools.cache
def get_openai_client() -> AsyncOpenAI:
    """Return a process-wide shared AsyncOpenAI client (created once)."""
    return AsyncOpenAI()
94
+
95
+
96
@functools.cache
def get_embeddings_cache() -> diskcache.Cache:
    """Return the shared on-disk embeddings cache at ``.cache/embeddings``."""
    return diskcache.Cache(".cache/embeddings")
99
+
100
+
101
@functools.cache
def get_encoding_for_model(model: str) -> tiktoken.Encoding:
    """Return the tiktoken encoding for *model*, cached per model name.

    Falls back to the encoding registered for "gpt-5" when tiktoken does
    not recognize the model name.
    """
    try:
        return tiktoken.encoding_for_model(model)
    except KeyError:
        # Fix: the original message had an unbalanced quote ("'gpt-5").
        # Lazy %-style args skip formatting when WARNING is disabled.
        logger.warning(
            "Encoding for model %s not found, using default encoding 'gpt-5'",
            model,
        )
        return tiktoken.encoding_for_model("gpt-5")
110
+
111
+
112
def get_openai_embeddings_model(
    model: str = DEFAULT_OPENAI_MODEL,
    openai_client: AsyncOpenAI | None = None,
    cache: diskcache.Cache | None = None,
    encoding: tiktoken.Encoding | None = None,
    max_batch_size: int = DEFAULT_EMBEDDINGS_MAX_BATCH_SIZE,
    max_input_tokens: int = DEFAULT_MAX_INPUT_TOKENS,
) -> "AsyncOpenAIEmbeddingsModel":
    """Build an embeddings model, filling unset arguments with shared defaults.

    Any of client/cache/encoding left as None falls back to the cached
    module-level singletons above.
    """
    return AsyncOpenAIEmbeddingsModel(
        model=model,
        openai_client=openai_client or get_openai_client(),
        cache=cache or get_embeddings_cache(),
        encoding=encoding or get_encoding_for_model(model),
        max_batch_size=max_batch_size,
        max_input_tokens=max_input_tokens,
    )
128
+
129
+
130
def get_model_settings(dimensions: int = DEFAULT_DIMENSIONS) -> "ModelSettings":
    """Return ModelSettings requesting *dimensions*-sized embeddings."""
    return ModelSettings(dimensions=dimensions)
132
+
133
+
134
def quote_sql_string(s: str) -> str:
    """Return *s* as a single-quoted SQL string literal.

    Embedded single quotes are escaped by doubling them, per SQL convention.
    """
    escaped = s.replace("'", "''")
    return f"'{escaped}'"
136
+
137
+
138
def tag_filter_any(tags: list[str]) -> str:
    """Build a SQL predicate matching rows that share at least one of *tags*."""
    joined = ", ".join(quote_sql_string(tag) for tag in tags)
    return f"array_has_any(tags, [{joined}])"
141
+
142
+
143
def tag_filter_all(tags: list[str]) -> str:
    """Build a SQL predicate matching rows that contain every tag in *tags*."""
    joined = ", ".join(quote_sql_string(tag) for tag in tags)
    return f"array_has_all(tags, [{joined}])"
146
+
147
+
148
def recommended_vector_index_config(
    row_count: int,
    dim: int,
    *,
    prefer: VectorIndexPreference = "balanced",
):
    """Return a LanceDB vector index config for dot-search.

    Assumes document vectors and query vectors are normalized.
    Returns None when brute-force is better or when there are not enough rows.

    Args:
        row_count: Current number of rows in the documents table.
        dim: Embedding dimensionality (PQ subvector counts must divide it).
        prefer: Trade-off — "storage" favors compression, "accuracy" recall,
            "latency" query speed, "balanced" a middle ground.
    """

    # Too small. Brute-force is exact and fast.
    # Also avoids PQ training errors.
    if row_count < 256:
        return None

    # Still small. Brute-force is usually fine.
    # If you really want an index, IvfFlat is safer than PQ.
    if row_count < 10_000:
        if prefer in {"accuracy", "latency"}:
            return lancedb.index.IvfFlat(
                distance_type="dot",
                num_partitions=32,
            )
        return None

    # Mid-size: IVF without quantization keeps exact distances.
    if row_count < 50_000:
        return lancedb.index.IvfFlat(
            distance_type="dot",
            num_partitions=128,
        )

    if row_count < 100_000:
        # Only compress (PQ) when storage is the explicit priority.
        if prefer == "storage":
            return lancedb.index.IvfPq(
                distance_type="dot",
                num_partitions=256,
                num_sub_vectors=recommended_num_sub_vectors(dim, prefer),
                num_bits=8,
            )

        return lancedb.index.IvfFlat(
            distance_type="dot",
            num_partitions=256,
        )

    if row_count < 500_000:
        return lancedb.index.IvfPq(
            distance_type="dot",
            num_partitions=1024 if prefer == "storage" else 2048,
            num_sub_vectors=recommended_num_sub_vectors(dim, prefer),
            num_bits=8,
        )

    if row_count < 1_000_000:
        return lancedb.index.IvfPq(
            distance_type="dot",
            num_partitions=2048 if prefer == "storage" else 4096,
            num_sub_vectors=recommended_num_sub_vectors(dim, prefer),
            num_bits=8,
        )

    # >= 1M rows: HNSW trades memory for the lowest query latency.
    if prefer == "latency":
        return lancedb.index.HnswPq(
            distance_type="dot",
            m=20,
            ef_construction=300,
            num_sub_vectors=recommended_num_sub_vectors(dim, "balanced"),
            num_bits=8,
        )

    return lancedb.index.IvfPq(
        distance_type="dot",
        num_partitions=4096 if prefer in {"storage", "balanced"} else 8192,
        num_sub_vectors=recommended_num_sub_vectors(dim, prefer),
        num_bits=8,
    )
226
+
227
+
228
def recommended_num_sub_vectors(
    dim: int, prefer: "VectorIndexPreference" = "balanced"
) -> int:
    """Return a PQ subvector count.

    Higher = more accurate, larger index.
    Lower = more compressed, lower recall.

    Picks the divisor of *dim* whose subvector size (dim / count) lands
    closest to the preference target: storage=16, accuracy=8, otherwise 12.
    """
    # Prefer subvector sizes around 8~16 dimensions.
    target_by_prefer = {"storage": 16, "accuracy": 8}
    target_sub_dim = target_by_prefer.get(prefer, 12)

    # Scan divisors in ascending order; keep the first with the smallest
    # gap to the target (matches min()'s first-wins tie-breaking).
    best = 1
    best_gap = abs(dim - target_sub_dim)
    for divisor in range(2, dim + 1):
        if dim % divisor:
            continue
        gap = abs((dim / divisor) - target_sub_dim)
        if gap < best_gap:
            best, best_gap = divisor, gap
    return best
247
+
248
+
249
async def get_default_dimensions(
    openai_embeddings_model: AsyncOpenAIEmbeddingsModel,
) -> int:
    """Probe the embeddings model once to discover its native dimensionality.

    Embeds a short fixed string with default settings and returns the
    width of the resulting (1, d) array.
    """
    emb_result = await openai_embeddings_model.get_embeddings(
        input="Hello, world!",
        model_settings=ModelSettings(),
    )
    return emb_result.to_numpy().shape[1]
257
+
258
+
259
class ManifestLancedbModel(LanceModel):
    """Storage schema for the manifest table (one row per database)."""

    # NOTE(review): gen_id is annotated -> str while id is int — verify.
    id: int = Field(default_factory=gen_id)
    name: Text = Field(description="The name of the database.")
    description: Text = Field(description="The description of the database.")
    model: Text = Field(description="The embedding model name.")
    dimensions: int = Field(description="The dimensions of the embeddings.")
    last_updated: int = Field(default_factory=lambda: int(time.time()))
266
+
267
+
268
class ManifestModel(LanceModel):
    """Plain view of a manifest row as returned to callers."""

    id: int
    name: Text
    description: Text
    model: Text
    dimensions: int
    last_updated: int
275
+
276
+
277
class DocumentCreate(BaseModel):
    """Payload for creating a document: content plus optional tags."""

    content: Text
    tags: List[Text] = Field(default_factory=list)

    @model_validator(mode="after")
    def validate_values(self) -> "DocumentCreate":
        # Strip whitespace and reject empty content before any embedding call.
        self.content = self.content.strip()
        if not self.content:
            raise ValueError("Content cannot be empty")
        return self
287
+
288
+
289
class Document(BaseModel):
    """A stored document as returned to callers (vector may be None)."""

    id: int
    content: Text
    md5: Text
    vector: Optional[List[float]]
    tags: List[Text]
295
+
296
+
297
class Lnclite:
    """Async LanceDB-backed document store with OpenAI embeddings.

    Construct via the async factories ``new`` (fresh store), ``new_from_dir``
    (ingest a directory), or ``load`` (open and validate an existing store).
    Exposes ``manifest`` and ``documents`` sub-APIs plus top-level ``search``.
    """

    def __init__(
        self,
        lancedb_path: Path | str | None = None,
        *,
        manifest_table: Text = DEFAULT_MANIFEST_TABLE,
        document_table: Text = DEFAULT_DOCUMENT_TABLE,
        openai_embeddings_model: "AsyncOpenAIEmbeddingsModel",
        model_settings: "ModelSettings",
        token_secret_key: Text = "__lnclite__",
        vector_search_prefer: VectorIndexPreference = "balanced",
        verbose: bool = False,
    ):
        # NOTE(review): the default ``lancedb_path=None`` would make
        # ``Path(None)`` raise TypeError — callers must pass a real path.
        self.lancedb_path = Path(lancedb_path)
        self._connection: lancedb.AsyncConnection | None = None

        self.manifest_table = manifest_table
        self.document_table = document_table

        self.openai_embeddings_model = openai_embeddings_model
        self.model_settings = model_settings
        # Mirror the model's private token limit for convenient access.
        self.max_tokens = self.openai_embeddings_model._max_input_tokens

        # Key used to sign and verify pagination tokens.
        self._secret_key = token_secret_key
        self.vector_search_prefer = vector_search_prefer
        self.verbose = verbose

        if self.model_settings.dimensions is None:
            raise ValueError("Model settings dimensions is not set")
        # Resolve the per-dimension schema classes once per instance.
        self._document_lancedb_model: Type[LanceModel] = get_document_lancedb_model(
            self.model_settings.dimensions
        )
        self._manifest_lancedb_model = ManifestLancedbModel

    @classmethod
    async def new(
        cls,
        lancedb_path: Path | str,
        *,
        manifest_table: Text = DEFAULT_MANIFEST_TABLE,
        document_table: Text = DEFAULT_DOCUMENT_TABLE,
        openai_embeddings_model: "AsyncOpenAIEmbeddingsModel",
        model_settings: "ModelSettings",
        token_secret_key: Text = "__lnclite__",
        vector_search_prefer: VectorIndexPreference = "balanced",
        verbose: bool = False,
    ) -> "Lnclite":
        """Create a store at *lancedb_path*, which must be empty or absent.

        Raises ValueError when the directory exists and is non-empty.
        """
        lancedb_path = Path(lancedb_path)
        if lancedb_path.is_dir():
            # If not an empty directory, raise an error
            for _ in lancedb_path.iterdir():
                raise ValueError(f"Lancedb path {lancedb_path} already exists ")
        else:
            lancedb_path.mkdir(parents=True, exist_ok=True)

        if model_settings.dimensions is None:
            # Probe the model once to discover its native dimensionality.
            model_settings.dimensions = await get_default_dimensions(
                openai_embeddings_model
            )

        return cls(
            lancedb_path=lancedb_path,
            manifest_table=manifest_table,
            document_table=document_table,
            openai_embeddings_model=openai_embeddings_model,
            model_settings=model_settings,
            token_secret_key=token_secret_key,
            vector_search_prefer=vector_search_prefer,
            verbose=verbose,
        )

    @classmethod
    async def new_from_dir(
        cls,
        dir_path: Path | str,
        lancedb_path: Path | str,
        *,
        dataset_name: Text,
        dataset_description: Text,
        manifest_table: Text = DEFAULT_MANIFEST_TABLE,
        document_table: Text = DEFAULT_DOCUMENT_TABLE,
        openai_embeddings_model: "AsyncOpenAIEmbeddingsModel",
        model_settings: "ModelSettings",
        extension_readers: Optional[Dict[str, "FileReader"]] = None,
        batch_size: int = 100,
    ) -> "Lnclite":
        """Build a new store by ingesting every readable file under *dir_path*.

        Files are embedded in batches of *batch_size*; each document is tagged
        with ``path:<relative path>``. A manifest row is written first, and a
        vector index is created at the end.
        """
        from lnclite.file_ingestor import FileIngestor

        dir_path = Path(dir_path)
        if not dir_path.is_dir():
            raise FileNotFoundError(f"Directory {dir_path} not found")

        lancedb_path = Path(lancedb_path)
        if lancedb_path.is_dir():
            # If not an empty directory, raise an error
            for _ in lancedb_path.iterdir():
                raise ValueError(f"Lancedb path {lancedb_path} already exists ")

        if model_settings.dimensions is None:
            model_settings.dimensions = await get_default_dimensions(
                openai_embeddings_model
            )

        lnclite = cls(
            lancedb_path=lancedb_path,
            manifest_table=manifest_table,
            document_table=document_table,
            openai_embeddings_model=openai_embeddings_model,
            model_settings=model_settings,
        )

        # Create manifest
        await lnclite.manifest.upsert(
            name=dataset_name,
            description=dataset_description,
            model=openai_embeddings_model.model,
            dimensions=model_settings.dimensions,
        )

        # Create documents
        file_ingestor = FileIngestor()
        if extension_readers is not None:
            for extension, reader in extension_readers.items():
                file_ingestor.register_reader(extension, reader)

        batch: List[DocumentCreate] = []
        async for file in file_ingestor.ingest_async(dir_path):
            _file_content = file["content"].strip()
            _file_path = str(file["path"])

            if not _file_content:
                logger.warning(f"Skipping {_file_path} due to empty content")
                continue

            # Tag each document with its path relative to the ingest root.
            relative_path = Path(_file_path).relative_to(dir_path).as_posix()
            batch.append(
                DocumentCreate(
                    content=_file_content,
                    tags=[f"path:{relative_path}"],
                )
            )

            # Flush full batches to bound memory and request size.
            if len(batch) >= batch_size:
                await lnclite.documents.batch_create(batch)
                logger.info(f"Created {len(batch)} documents")
                batch = []

        if batch:
            await lnclite.documents.batch_create(batch)
            logger.info(f"Created {len(batch)} documents")

        await lnclite.documents.create_index()

        return lnclite

    @classmethod
    async def load(
        cls,
        lancedb_path: Path | str,
        *,
        manifest_table: Text = DEFAULT_MANIFEST_TABLE,
        document_table: Text = DEFAULT_DOCUMENT_TABLE,
        openai_embeddings_model: "AsyncOpenAIEmbeddingsModel",
        model_settings: "ModelSettings",
        vector_search_prefer: VectorIndexPreference = "balanced",
        refresh_index: bool = False,
        verbose: bool = False,
    ) -> "Lnclite":
        """Open an existing store and validate it against the given model.

        Raises FileNotFoundError when the path is missing, LncliteNotFoundError
        when no manifest exists, and ValueError on model/dimension mismatch.
        """
        lancedb_path = Path(lancedb_path)
        if not lancedb_path.is_dir():
            raise FileNotFoundError(f"Lancedb path {lancedb_path} not found")

        if model_settings.dimensions is None:
            model_settings.dimensions = await get_default_dimensions(
                openai_embeddings_model
            )

        lnclite = cls(
            lancedb_path=lancedb_path,
            manifest_table=manifest_table,
            document_table=document_table,
            openai_embeddings_model=openai_embeddings_model,
            model_settings=model_settings,
            vector_search_prefer=vector_search_prefer,
            verbose=verbose,
        )

        # Validate manifest
        manifest = await lnclite.manifest.get()
        if manifest is None:
            raise LncliteNotFoundError("Manifest not found")
        if manifest.model != openai_embeddings_model.model:
            raise ValueError(
                f"OpenAI embeddings model mismatch: {manifest.model} != {openai_embeddings_model.model}"  # noqa: E501
            )
        if manifest.dimensions != model_settings.dimensions:
            raise ValueError(
                f"Model settings dimensions mismatch: {manifest.dimensions} != {model_settings.dimensions}"  # noqa: E501
            )

        if refresh_index:
            await lnclite.documents.create_index()

        return lnclite

    async def get_connection(self) -> lancedb.AsyncConnection:
        """Return the shared async connection, connecting lazily on first use."""
        if self._connection is None:
            self._connection = await lancedb.connect_async(self.lancedb_path)
            logger.info(f"Lancedb connected to {self.lancedb_path}")
        return self._connection

    @functools.cached_property
    def manifest(self) -> "Manifest":
        # One Manifest accessor per client instance.
        return Manifest(self)

    @functools.cached_property
    def documents(self) -> "Documents":
        # One Documents accessor per client instance.
        return Documents(self)

    async def create_index(self) -> None:
        """Convenience alias for ``documents.create_index()``."""
        await self.documents.create_index()

    async def embed(self, texts: List[Text]) -> np.ndarray:
        """Embed *texts* and L2-normalize the result for dot-product search."""
        emb_res = await self.openai_embeddings_model.get_embeddings(
            texts, model_settings=self.model_settings
        )
        return normalize(emb_res.to_numpy())  # (n, d)

    async def search(
        self,
        query: Text,
        *,
        tags_any: Optional[List[Text]] = None,
        tags_all: Optional[List[Text]] = None,
        limit: int = 5,
        verbose: bool = False,
    ) -> "SearchResults":
        """Semantic search over documents, optionally filtered by tags.

        Args:
            query: Natural-language query text.
            tags_any: Keep documents carrying at least one of these tags.
            tags_all: Keep documents carrying all of these tags.
            limit: Maximum number of results to return.
            verbose: Log the query plan (also enabled by ``self.verbose``).
        """
        document_table = await self.documents.get_table()

        # Single-element batch; take row 0 of the (1, d) result.
        query_vector = (await self.embed([query]))[0]

        search_query = await document_table.search(query_vector)
        tags_filter = _tags_filter(tags_any=tags_any, tags_all=tags_all)
        if tags_filter is not None:
            search_query = search_query.where(tags_filter)

        if verbose or self.verbose:
            logger.info(f"Query plan: {await search_query.explain_plan()}")

        # Dot distance matches the normalized vectors written at ingest time.
        search_results: List[Dict] = (
            await search_query.distance_type("dot").limit(limit).to_list()
        )

        results: List[SearchResult] = []
        for result in search_results:
            _doc = Document.model_validate(result)
            _distance = result["_distance"]
            results.append(SearchResult(document=_doc, distance=_distance))

        return SearchResults(results=results)
557
+
558
+
559
class Manifest:
    """Accessor for the single-row manifest table of a Lnclite store."""

    def __init__(self, client: "Lnclite"):
        self.client = client
        # Table handle, opened lazily and cached.
        self._table: lancedb.AsyncTable | None = None

    async def get_table(self) -> lancedb.AsyncTable:
        """Open (or create on first use) the manifest table and cache it."""
        if self._table is not None:
            return self._table

        conn = await self.client.get_connection()
        if await _table_exists(conn, self.client.manifest_table):
            self._table = await conn.open_table(self.client.manifest_table)
        else:
            self._table = await conn.create_table(
                self.client.manifest_table, schema=self.client._manifest_lancedb_model
            )
        return self._table

    async def get(self) -> ManifestModel | None:
        """Return the manifest row, or None when the table is empty."""
        manifest_table = await self.get_table()
        _query_builder = manifest_table.query()
        manifests = await _query_builder.limit(1).to_pydantic(
            self.client._manifest_lancedb_model
        )
        if len(manifests) > 0:
            # Round-trip through JSON to convert between model classes.
            return ManifestModel.model_validate_json(manifests[0].model_dump_json())
        return None

    async def retrieve(self) -> ManifestModel:
        """Like ``get`` but raises LncliteNotFoundError when missing."""
        might_manifest = await self.get()
        if might_manifest is not None:
            return might_manifest
        raise LncliteNotFoundError("Manifest not found")

    async def upsert(
        self,
        *,
        name: Text,
        description: Text,
        model: Text,
        dimensions: int,
    ) -> ManifestModel:
        """Insert the manifest row, or update it in place when one exists."""
        table = await self.get_table()
        might_manifest = await self.get()

        if might_manifest is None:
            manifest = self.client._manifest_lancedb_model(
                name=name, description=description, model=model, dimensions=dimensions
            )
            await table.add([manifest])

        else:
            manifest = might_manifest
            manifest.name = name
            manifest.description = description
            manifest.model = model
            manifest.dimensions = dimensions
            manifest.last_updated = int(time.time())
            # NOTE(review): AsyncTable.update typically expects a dict of
            # column updates — confirm passing a model as ``values`` works.
            await table.update(where=f"id = {manifest.id}", values=manifest)

        return ManifestModel.model_validate_json(manifest.model_dump_json())
620
+
621
+
622
class Documents:
    """Accessor for the documents table: CRUD, indexing, and listing."""

    def __init__(self, client: "Lnclite"):
        self.client = client
        # Table handle, opened lazily and cached.
        self._table: lancedb.AsyncTable | None = None

    async def get_table(self) -> lancedb.AsyncTable:
        """Open (or create on first use) the documents table and cache it."""
        if self._table is not None:
            return self._table

        conn = await self.client.get_connection()
        if await _table_exists(conn, self.client.document_table):
            self._table = await conn.open_table(self.client.document_table)
        else:
            self._table = await conn.create_table(
                self.client.document_table, schema=self.client._document_lancedb_model
            )
        return self._table

    async def create_index(self) -> None:
        """Create the tag index and, when worthwhile, a vector index."""
        table = await self.get_table()

        # Tags are a list column; LabelList enables array_has_* filters.
        await table.create_index("tags", config=lancedb.index.LabelList())

        row_count = await self.count()

        # Pick an index config scaled to the current table size.
        vs_config = recommended_vector_index_config(
            row_count,
            self.client.model_settings.dimensions,
            prefer=self.client.vector_search_prefer,
        )
        if vs_config is None:
            logger.info(
                "Skipping vector index: row_count=%s is too small; brute-force search is exact and fast",  # noqa: E501
                row_count,
            )
        else:
            await table.create_index("vector", config=vs_config)
            logger.info(f"Created vector index with config: {vs_config}")

    async def count(self) -> int:
        """Return the number of document rows."""
        document_table = await self.get_table()
        return await document_table.count_rows()

    async def get(self, id: int) -> Document | None:
        """Return the document with *id*, or None when absent."""
        document_table = await self.get_table()
        documents = await (
            document_table.query()
            .where(f"id = {id}")
            .limit(1)
            .to_pydantic(self.client._document_lancedb_model)
        )
        if len(documents) > 0:
            return Document.model_validate_json(documents[0].model_dump_json())
        return None

    async def retrieve(self, id: int) -> Document:
        """Like ``get`` but raises LncliteNotFoundError when absent."""
        might_document = await self.get(id)
        if might_document is not None:
            return might_document
        raise LncliteNotFoundError(f"Document with id {id} not found")

    async def create(self, document_create: DocumentCreate) -> Document:
        """Create a single document (thin wrapper over ``batch_create``)."""
        return (await self.batch_create([document_create]))[0]

    async def batch_create(
        self, document_creates: List[DocumentCreate]
    ) -> List[Document]:
        """Embed and insert many documents, returning them with vectors set."""
        document_table = await self.get_table()

        # One batched embeddings call for all contents.
        normalized_vectors = await self.client.embed(
            [d.content for d in document_creates]
        )

        documents = [
            self.client._document_lancedb_model(
                content=d.content, tags=d.tags, vector=v
            )
            for d, v in zip(document_creates, normalized_vectors)
        ]

        await document_table.add(documents)

        output: List[Document] = []
        for document, v in zip(documents, normalized_vectors):
            _doc = Document.model_validate_json(
                document.model_dump_json(exclude_none=True)
            )
            # Attach the vector explicitly since the dump may exclude it.
            _doc.vector = v.tolist()
            output.append(_doc)

        return output

    async def list(
        self,
        *,
        tags_any: Optional[List[Text]] = None,
        tags_all: Optional[List[Text]] = None,
        limit: int = 10,
        order: ListOrder = "asc",
        next_page_token: Optional[Text] = None,
        verbose: bool = False,
    ) -> TokenPaginatic[Document]:
        """List documents with signed-token keyset pagination over ``id``.

        The page token encodes the last-seen id; passing it back continues
        from that point in the same order.
        """
        if limit < 1:
            raise ValueError(f"Limit must be greater than 0, got {limit}")

        sql_order = _to_sql_order(order)
        # Keyset comparison direction must follow the sort order.
        id_operator = ">" if sql_order == "ASC" else "<"
        after_id: Optional[int] = None
        if next_page_token is not None:
            decoded_token = decode_and_verify(next_page_token, self.client._secret_key)
            after_id = decoded_token.get("after")

        document_table = await self.client.documents.get_table()

        # Prepare query
        query_builder = document_table.query()

        # Fetch one extra row to detect whether another page exists.
        query_builder = query_builder.where(
            _documents_list_where_clause(
                id_operator=id_operator,
                sql_order=sql_order,
                after_id=after_id,
                tags_any=tags_any,
                tags_all=tags_all,
            )
        ).limit(limit + 1)
        if verbose or self.client.verbose:
            logger.info(f"Query plan: {await query_builder.explain_plan()}")

        # Execute query
        documents = await query_builder.to_pydantic(self.client._document_lancedb_model)

        has_more = len(documents) > limit
        documents = documents[:limit]

        _next_token = (
            encode_and_sign({"after": documents[-1].id}, self.client._secret_key)
            if has_more
            else None
        )

        return TokenPaginatic(
            object="list",
            data=[Document.model_validate_json(d.model_dump_json()) for d in documents],
            next_page_token=_next_token,
        )
768
+
769
+
770
class SearchResult(BaseModel):
    """One search hit: the document plus its dot-product distance."""

    document: Document
    distance: float
773
+
774
+
775
class SearchResults(BaseModel):
    """Ordered container of search hits returned by Lnclite.search."""

    results: List[SearchResult]
777
+
778
+
779
class LncliteNotFoundError(Exception):
    """Raised when a requested manifest or document does not exist."""

    pass
781
+
782
+
783
def _documents_list_where_clause(
    *,
    id_operator: Literal[">", "<"],
    sql_order: "SqlOrder",
    after_id: Optional[int] = None,
    tags_any: Optional[List[Text]] = None,
    tags_all: Optional[List[Text]] = None,
) -> str:
    """Build the filter string used by Documents.list keyset pagination.

    NOTE(review): the returned string embeds "ORDER BY id ..." even though
    it is passed to a .where() filter — confirm LanceDB accepts ordering
    inside that predicate.
    """
    if after_id is None:
        # No cursor yet: a tautology keeps the clause structure uniform.
        clauses = ["id > 0"]
    else:
        clauses = [f"id {id_operator} {after_id}"]

    tag_clause = _tags_filter(tags_any=tags_any, tags_all=tags_all)
    if tag_clause:
        clauses.append(tag_clause)

    predicate = " AND ".join(f"({clause})" for clause in clauses)
    return f"{predicate} ORDER BY id {sql_order}"
799
+
800
+
801
+ def _tags_filter(
802
+ *,
803
+ tags_any: Optional[List[Text]] = None,
804
+ tags_all: Optional[List[Text]] = None,
805
+ ) -> Optional[str]:
806
+ filters: List[str] = []
807
+ if tags_any:
808
+ filters.append(tag_filter_any(tags_any))
809
+ if tags_all:
810
+ filters.append(tag_filter_all(tags_all))
811
+ if not filters:
812
+ return None
813
+ return " AND ".join(f"({filter_})" for filter_ in filters)
814
+
815
+
816
+ def _to_sql_order(order: ListOrder) -> SqlOrder:
817
+ if order in ("asc", 1):
818
+ return "ASC"
819
+ if order in ("desc", -1):
820
+ return "DESC"
821
+ raise ValueError(f"Invalid order: {order}")
822
+
823
+
824
async def _table_exists(conn: lancedb.AsyncConnection, table_name: Text) -> bool:
    """Return True when *table_name* already exists on *conn*."""
    table_list = await conn.list_tables()
    return table_name in table_list.tables
@@ -0,0 +1,133 @@
1
+ """Directory crawling and text extraction from readable files.
2
+
3
+ Provides FilesIngestor for scanning trees while skipping binary and hidden paths.
4
+ """
5
+
6
+ import asyncio
7
+ import logging
8
+ from pathlib import Path
9
+ from typing import (
10
+ AsyncGenerator,
11
+ Awaitable,
12
+ Callable,
13
+ Dict,
14
+ Generator,
15
+ TypeAlias,
16
+ TypedDict,
17
+ cast,
18
+ )
19
+
20
+ logger: logging.Logger = logging.getLogger(__name__)
21
+
22
+ DEFAULT_BINARY_PROBE_CHUNK_SIZE: int = 1024
23
+
24
+ FileReader: TypeAlias = Callable[[Path], str] | Callable[[Path], Awaitable[str]]
25
+
26
+
27
+ class FileIngestorResult(TypedDict):
28
+ path: str
29
+ content: str
30
+
31
+
32
+ class FileIngestor:
33
+ """
34
+ A utility class to crawl directories and extract text content
35
+ from readable files while filtering out binary and hidden data.
36
+ """
37
+
38
+ def __init__(self) -> None:
39
+ # Maps file extensions to specific parsing functions
40
+ self._custom_readers: Dict[str, FileReader] = {}
41
+
42
+ def register_reader(self, extension: str, reader_func: FileReader) -> None:
43
+ """Registers a handler for a specific file extension (e.g., '.pdf')."""
44
+ self._custom_readers[extension.lower()] = reader_func
45
+
46
+ def ingest(self, dir_path: str) -> Generator[FileIngestorResult, None, None]:
47
+ """
48
+ Iterates through the directory and yields documents as they are processed.
49
+ Yields:
50
+ Dict containing 'path' and 'content' keys.
51
+ """
52
+ root = Path(dir_path)
53
+
54
+ for file_path in root.rglob("*"):
55
+ # Ensure it is a file and not an excluded path
56
+ if not file_path.is_file() or self._is_excluded(file_path):
57
+ continue
58
+
59
+ extension = file_path.suffix.lower()
60
+
61
+ try:
62
+ # 1. Use specialized reader if registered
63
+ if extension in self._custom_readers:
64
+ reader = self._custom_readers[extension]
65
+ if asyncio.iscoroutinefunction(reader):
66
+ logger.warning(
67
+ "Skipping %s: async reader requires ingest_async.",
68
+ file_path,
69
+ )
70
+ continue
71
+ sync_reader = cast(Callable[[Path], str], reader)
72
+ content = sync_reader(file_path)
73
+ yield FileIngestorResult(path=str(file_path), content=content)
74
+
75
+ # 2. Fallback to binary probe for generic text files
76
+ elif not self._is_binary(file_path):
77
+ content = self._read_text(file_path)
78
+ yield FileIngestorResult(path=str(file_path), content=content)
79
+
80
+ except Exception as e:
81
+ logger.warning("Skipping %s due to error: %s", file_path, e)
82
+
83
+ async def ingest_async(
84
+ self, dir_path: str
85
+ ) -> AsyncGenerator[FileIngestorResult, None]:
86
+ """Async variant of ingest: walks the tree sync, reads in worker threads."""
87
+ root = Path(dir_path)
88
+
89
+ for file_path in root.rglob("*"):
90
+ if not file_path.is_file() or self._is_excluded(file_path):
91
+ continue
92
+
93
+ extension = file_path.suffix.lower()
94
+
95
+ try:
96
+ if extension in self._custom_readers:
97
+ reader = self._custom_readers[extension]
98
+ is_coro_reader = asyncio.iscoroutinefunction(reader)
99
+ if is_coro_reader:
100
+ content = await reader(file_path)
101
+ else:
102
+ content = reader(file_path)
103
+ yield FileIngestorResult(path=str(file_path), content=content)
104
+ elif not await asyncio.to_thread(self._is_binary, file_path):
105
+ content = await asyncio.to_thread(self._read_text, file_path)
106
+ yield FileIngestorResult(path=str(file_path), content=content)
107
+ except Exception as e:
108
+ logger.warning("Skipping %s due to error: %s", file_path, e)
109
+
110
+ def _is_binary(
111
+ self, file_path: Path, chunk_size: int = DEFAULT_BINARY_PROBE_CHUNK_SIZE
112
+ ) -> bool:
113
+ """Determines if a file is binary using null-byte detection and UTF-8 probing.""" # noqa: E501
114
+ try:
115
+ with open(file_path, "rb") as f:
116
+ chunk = f.read(chunk_size)
117
+ # Null bytes are standard in binary formats
118
+ if b"\0" in chunk:
119
+ return True
120
+ # Attempt decoding to verify it's a valid text format
121
+ chunk.decode("utf-8")
122
+ return False
123
+ except (UnicodeDecodeError, Exception):
124
+ return True
125
+
126
+ def _is_excluded(self, path: Path) -> bool:
127
+ """Checks if any part of the file path starts with '.' or '_'."""
128
+ return any(part.startswith((".", "_")) for part in path.parts)
129
+
130
+ def _read_text(self, file_path: Path) -> str:
131
+ """Reads plain text files with encoding safety."""
132
+ with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
133
+ return f.read()
File without changes
@@ -0,0 +1,18 @@
1
+ """Index parameter helpers for LanceDB vector indexes."""
2
+
3
+ import math
4
+
5
+
6
def calculate_index_params(n_rows: int, dimension: int) -> tuple[int, int]:
    """Derive (num_partitions, num_sub_vectors) for a LanceDB vector index.

    Partitions scale with sqrt(n_rows) but are capped so every partition
    keeps at least ~256 rows (minimum 1). Sub-vectors are the largest
    divisor of *dimension* not exceeding dimension // 8, falling back to 1
    when no such divisor exists.
    """
    partition_cap = n_rows // 256
    num_partitions = max(1, min(int(math.sqrt(n_rows) * 8), partition_cap))

    upper = dimension // 8
    # Largest divisor of the dimension that is <= dimension // 8.
    num_sub_vectors = next(
        (candidate for candidate in range(upper, 0, -1) if dimension % candidate == 0),
        1,
    )

    return num_partitions, num_sub_vectors
@@ -0,0 +1,13 @@
1
+ from pathlib import Path
2
+
3
+ import xxhash
4
+
5
+
6
def get_file_fp(file_path: Path | str) -> str:
    """Return the xxh64 hex fingerprint of a file, streamed in 64 KiB chunks."""
    digest = xxhash.xxh64()

    with open(file_path, "rb") as stream:
        # iter() with a sentinel keeps reading until EOF returns b"".
        for block in iter(lambda: stream.read(65536), b""):
            digest.update(block)

    return digest.hexdigest()
@@ -0,0 +1,37 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ import xxhash
5
+
6
+
7
def get_folder_fingerprint(target_path: Path | str, read_content: bool = False) -> str:
    """Compute a single rolling xxh64 fingerprint for a directory tree.

    For every regular file, mixes in its relative path (detects moves and
    renames) plus either its raw bytes (read_content=True, exact but slow)
    or its mtime+size metadata (default, fast).

    Args:
        target_path: Root directory to fingerprint.
        read_content: Hash file contents instead of stat metadata.

    Returns:
        Hex digest string of the combined hash.
    """
    # Build a single rolling hash with xxh64.
    folder_hash = xxhash.xxh64()

    # Keep traversal order stable; otherwise the same tree can hash differently
    # if os.walk yields entries in a different order.
    for root, dirs, files in os.walk(target_path):
        # Fix: os.walk yields names in filesystem order, which varies across
        # platforms. Sorting `dirs` IN PLACE pins the directory traversal
        # order; sorting `files` pins the per-directory file order.
        dirs.sort()
        for name in sorted(files):
            file_path = os.path.join(root, name)

            try:
                # 1. Mix in relative path (detects moves/renames).
                rel_path = os.path.relpath(file_path, target_path)
                folder_hash.update(rel_path.encode())

                # 2. Mix in file metadata — fast default.
                # Use content hashing only if you need stronger accuracy
                # and can pay the I/O cost.
                if read_content:
                    # Catches changes where mtime/size might not move (rare
                    # edge cases). Chunked reads bound memory on large files;
                    # the streamed digest equals hashing the whole blob.
                    with open(file_path, "rb") as f:
                        while chunk := f.read(65536):
                            folder_hash.update(chunk)
                else:
                    stat = os.stat(file_path)
                    folder_hash.update(str(stat.st_mtime).encode())  # modification time
                    folder_hash.update(str(stat.st_size).encode())  # file size

            except OSError:
                # Files can vanish or become unreadable mid-walk; skip them.
                # (OSError covers the PermissionError/FileNotFoundError the
                # original caught, plus other I/O races.)
                continue

    return folder_hash.hexdigest()
@@ -0,0 +1,95 @@
1
+ import hashlib
2
+ import os
3
+ import socket
4
+ import threading
5
+ import time
6
+ from typing import Optional
7
+
8
# Process-wide Snowflake generator, created lazily on first generate_id() call.
_global_generator: Optional["Snowflake"] = None
9
+
10
+
11
class Snowflake:
    """Thread-safe Snowflake-style unique ID generator.

    ID layout (low to high bits): 12-bit sequence | 10-bit worker id |
    milliseconds since the custom epoch (2024-01-01 UTC).
    """

    def __init__(self, worker_id: int):
        self.twepoch = 1704067200000  # 2024-01-01

        # Define the bit lengths of each part
        self.worker_id_bits = 10
        self.sequence_bits = 12

        # Calculate the maximum value
        self.max_worker_id = -1 ^ (-1 << self.worker_id_bits)  # 1023

        # Shift amounts
        self.worker_id_shift = self.sequence_bits
        self.timestamp_left_shift = self.sequence_bits + self.worker_id_bits
        self.sequence_mask = -1 ^ (-1 << self.sequence_bits)  # 4095

        # Fix: the original computed max_worker_id but never enforced it, so
        # an oversized worker id silently overflowed into the timestamp bits
        # and corrupted the ID layout.
        if not 0 <= worker_id <= self.max_worker_id:
            raise ValueError(
                f"worker_id must be in [0, {self.max_worker_id}], got {worker_id}"
            )
        self.worker_id = worker_id

        self.sequence = 0
        self.last_timestamp = -1

        self.lock = threading.Lock()

    def _get_timestamp(self) -> int:
        """Current wall-clock time in milliseconds."""
        return int(time.time() * 1000)

    def generate(self) -> int:
        """Return the next unique, strictly increasing ID.

        Raises:
            RuntimeError: if the system clock moved backwards (generating
                anyway could emit duplicate IDs).
        """
        with self.lock:
            timestamp = self._get_timestamp()

            if timestamp < self.last_timestamp:
                # RuntimeError (an Exception subclass) keeps existing
                # `except Exception` callers working.
                raise RuntimeError("Clock backward exception")

            if timestamp == self.last_timestamp:
                # Within the same millisecond, the sequence number increases
                self.sequence = (self.sequence + 1) & self.sequence_mask
                if self.sequence == 0:
                    # If the sequence number is exhausted, wait for the next millisecond
                    while timestamp <= self.last_timestamp:
                        timestamp = self._get_timestamp()
            else:
                self.sequence = 0

            self.last_timestamp = timestamp

            # Combine the parts and perform bitwise left shift
            new_id = (
                ((timestamp - self.twepoch) << self.timestamp_left_shift)
                | (self.worker_id << self.worker_id_shift)
                | self.sequence
            )

            return new_id
63
+
64
+
65
def get_valid_worker_id(max_bits: int = 10) -> int:
    """Resolve a worker id in [0, 2**max_bits - 1] from WORKER_ID or hostname.

    Numeric WORKER_ID values are folded into range with modulo; non-numeric
    values — and the hostname fallback when WORKER_ID is unset — are hashed
    with MD5 first, then folded. MD5 here is bucketing, not security.
    """
    bucket_count = 1 << max_bits  # 1024 for the default 10 bits

    raw = os.getenv("WORKER_ID")
    if raw:
        try:
            # Standard numeric input: converge into the valid range.
            return abs(int(raw)) % bucket_count
        except ValueError:
            # Non-numeric label (e.g. "app-worker-a"): hash the string to a
            # large integer, then converge via modulo.
            label_hash = int(hashlib.md5(raw.encode("utf-8")).hexdigest(), 16)
            return label_hash % bucket_count

    # No WORKER_ID set: fall back to the machine hostname as the source of
    # uniqueness.
    host_hash = int(
        hashlib.md5(socket.gethostname().encode("utf-8")).hexdigest(), 16
    )
    return host_hash % bucket_count
89
+
90
+
91
def generate_id() -> int:
    """Return a fresh snowflake ID from the lazily created shared generator."""
    global _global_generator
    generator = _global_generator
    if generator is None:
        # First call: derive a worker id and build the process-wide generator.
        generator = Snowflake(worker_id=get_valid_worker_id())
        _global_generator = generator
    return generator.generate()
@@ -0,0 +1,59 @@
1
# Core project metadata (PEP 621).
[project]
authors = [{ name = "Allen Chou", email = "f1470891079@gmail.com" }]
dependencies = [
    "lancedb",
    "openai",
    "openai-embeddings-model",
    "paginatic",
    "pydantic (>=2)",
    "xxhash",
]
description = "Lite usages of lancedb."
license = "MIT"
license-files = ["LICENSE"]
name = "lnclite"
readme = "README.md"
requires-python = ">=3.11,<4"
version = "0.1.0"

[project.urls]
Homepage = "https://github.com/allen2c/lnclite"
"PyPI" = "https://pypi.org/project/lnclite/"
Repository = "https://github.com/allen2c/lnclite"

# Poetry-specific packaging configuration.
[tool.poetry]
packages = [{ include = "lnclite" }]

# Development-only dependencies; not installed by end users.
[tool.poetry.group.dev.dependencies]
black = { extras = ["jupyter"], version = "*" }
isort = "*"
lines-of-work = { git = "https://github.com/allen2c/lines-of-work.git" }
logging-bullet-train = ">=0.4.0"
mkdocs-material = "*"
poetry-plugin-export = "*"
pytest = "*"
pytest-asyncio = "*"
pytest-cov = "*"
pytest-env = "*"
pytest-xdist = "*"
rich = "*"
rich-color-support = "*"
setuptools = "*"
twine = "*"

# Formatter / linter settings (isort profile matches black).
[tool.isort]
profile = "black"

[tool.black]
target-version = ["py311"]

# E203/E704/W503 conflict with black's formatting choices.
[tool.flake8]
ignore = ["E203", "E704", "W503"]
max-line-length = 88

# Environment variables injected during test runs (via pytest-env).
[tool.pytest.ini_options]
env = ["ENVIRONMENT=test", "PYTEST_IS_RUNNING=true"]

[build-system]
build-backend = "poetry.core.masonry.api"
requires = ["poetry-core>=2.0.0,<3.0.0"]
+ requires = ["poetry-core>=2.0.0,<3.0.0"]