scruby 0.10.4__py3-none-any.whl → 0.26.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of scruby has been flagged as potentially problematic.

scruby/db.py CHANGED
@@ -1,437 +1,204 @@
- """Creation and management of the database."""
-
- from __future__ import annotations
-
- __all__ = ("Scruby",)
-
- import concurrent.futures
- import contextlib
- import logging
- import zlib
- from collections.abc import Callable
- from pathlib import Path as SyncPath
- from shutil import rmtree
- from typing import Any, Literal, Never, TypeVar, assert_never
-
- import orjson
- from anyio import Path, to_thread
- from pydantic import BaseModel
-
- from scruby import constants
-
- logger = logging.getLogger(__name__)
-
- T = TypeVar("T")
-
-
- class _Meta(BaseModel):
-     """Metadata of Collection."""
-
-     counter_documents: int
-
-
- class Scruby[T]:
-     """Creation and management of database.
-
-     Args:
-         class_model: Class of Model (Pydantic).
-     """
-
-     def __init__( # noqa: D107
-         self,
-         class_model: T,
-     ) -> None:
-         self.__meta = _Meta
-         self.__class_model = class_model
-         self.__db_root = constants.DB_ROOT
-         self.__hash_reduce_left = constants.HASH_REDUCE_LEFT
-         # The maximum number of keys.
-         match self.__hash_reduce_left:
-             case 0:
-                 self.__max_num_keys = 4294967296
-             case 2:
-                 self.__max_num_keys = 16777216
-             case 4:
-                 self.__max_num_keys = 65536
-             case 6:
-                 self.__max_num_keys = 256
-             case _ as unreachable:
-                 msg: str = f"{unreachable} - Unacceptable value for HASH_REDUCE_LEFT."
-                 logger.critical(msg)
-                 assert_never(Never(unreachable))
-         # 1.Create metadata if absent.
-         # 2.Check metadata.
-         self._create_metadata()
-
-     def _create_metadata(self) -> None:
-         """Create metadata for collection if absent.
-
-         This method is for internal use.
-         """
-         key: int = 0
-         key_as_hash: str = f"{key:08x}"[self.__hash_reduce_left :]
-         separated_hash: str = "/".join(list(key_as_hash))
-         branch_path = SyncPath(
-             *(
-                 self.__db_root,
-                 self.__class_model.__name__,
-                 separated_hash,
-             ),
-         )
-         if not branch_path.exists():
-             branch_path.mkdir(parents=True)
-             meta = _Meta(
-                 counter_documents=0,
-             )
-             meta_json = meta.model_dump_json()
-             meta_path = SyncPath(*(branch_path, "meta.json"))
-             meta_path.write_text(meta_json, "utf-8")
-
-     async def _get_meta_path(self) -> Path:
-         """Asynchronous method for getting path to metadata of collection.
-
-         This method is for internal use.
-         """
-         key: int = 0
-         key_as_hash: str = f"{key:08x}"[self.__hash_reduce_left :]
-         separated_hash: str = "/".join(list(key_as_hash))
-         return Path(
-             *(
-                 self.__db_root,
-                 self.__class_model.__name__,
-                 separated_hash,
-                 "meta.json",
-             ),
-         )
-
-     async def _get_meta(self) -> _Meta:
-         """Asynchronous method for getting metadata of collection.
-
-         This method is for internal use.
-         """
-         meta_path = await self._get_meta_path()
-         meta_json = await meta_path.read_text()
-         meta: _Meta = self.__meta.model_validate_json(meta_json)
-         return meta
-
-     async def _set_meta(self, meta: _Meta) -> None:
-         """Asynchronous method for updating metadata of collection.
-
-         This method is for internal use.
-         """
-         meta_path = await self._get_meta_path()
-         meta_json = meta.model_dump_json()
-         await meta_path.write_text(meta_json, "utf-8")
-
-     async def _counter_documents(self, step: Literal[1, -1]) -> None:
-         """Management of documents in metadata of collection.
-
-         This method is for internal use.
-         """
-         meta = await self._get_meta()
-         meta.counter_documents += step
-         if meta.counter_documents < 0:
-             meta.counter_documents = 0
-         await self._set_meta(meta)
-
-     async def _get_leaf_path(self, key: str) -> Path:
-         """Asynchronous method for getting path to collection cell by key.
-
-         This method is for internal use.
-
-         Args:
-             key: Key name.
-         """
-         if not isinstance(key, str):
-             logger.error("The key is not a type of `str`.")
-             raise KeyError("The key is not a type of `str`.")
-         if len(key) == 0:
-             logger.error("The key should not be empty.")
-             raise KeyError("The key should not be empty.")
-         # Key to crc32 sum.
-         key_as_hash: str = f"{zlib.crc32(key.encode('utf-8')):08x}"[self.__hash_reduce_left :]
-         # Convert crc32 sum in the segment of path.
-         separated_hash: str = "/".join(list(key_as_hash))
-         # The path of the branch to the database.
-         branch_path: Path = Path(
-             *(
-                 self.__db_root,
-                 self.__class_model.__name__,
-                 separated_hash,
-             ),
-         )
-         # If the branch does not exist, need to create it.
-         if not await branch_path.exists():
-             await branch_path.mkdir(parents=True)
-         # The path to the database cell.
-         leaf_path: Path = Path(*(branch_path, "leaf.json"))
-         return leaf_path
-
-     async def set_key(
-         self,
-         key: str,
-         value: T,
-     ) -> None:
-         """Asynchronous method for adding and updating keys to collection.
-
-         Args:
-             key: Key name.
-             value: Value of key.
-         """
-         # The path to the database cell.
-         leaf_path: Path = await self._get_leaf_path(key)
-         value_json: str = value.model_dump_json()
-         # Write key-value to the database.
-         if await leaf_path.exists():
-             # Add new key or update existing.
-             data_json: bytes = await leaf_path.read_bytes()
-             data: dict = orjson.loads(data_json) or {}
-             if data.get(key) is None:
-                 await self._counter_documents(1)
-             data[key] = value_json
-             await leaf_path.write_bytes(orjson.dumps(data))
-         else:
-             # Add new key to a blank leaf.
-             await leaf_path.write_bytes(orjson.dumps({key: value_json}))
-             await self._counter_documents(1)
-
-     async def get_key(self, key: str) -> T:
-         """Asynchronous method for getting value of key from collection.
-
-         Args:
-             key: Key name.
-         """
-         # The path to the database cell.
-         leaf_path: Path = await self._get_leaf_path(key)
-         # Get value of key.
-         if await leaf_path.exists():
-             data_json: bytes = await leaf_path.read_bytes()
-             data: dict = orjson.loads(data_json) or {}
-             obj: T = self.__class_model.model_validate_json(data[key])
-             return obj
-         msg: str = "`get_key` - The unacceptable key value."
-         logger.error(msg)
-         raise KeyError()
-
-     async def has_key(self, key: str) -> bool:
-         """Asynchronous method for checking presence of key in collection.
-
-         Args:
-             key: Key name.
-         """
-         # The path to the database cell.
-         leaf_path: Path = await self._get_leaf_path(key)
-         # Checking whether there is a key.
-         if await leaf_path.exists():
-             data_json: bytes = await leaf_path.read_bytes()
-             data: dict = orjson.loads(data_json) or {}
-             try:
-                 data[key]
-                 return True
-             except KeyError:
-                 return False
-         return False
-
-     async def delete_key(self, key: str) -> None:
-         """Asynchronous method for deleting key from collection.
-
-         Args:
-             key: Key name.
-         """
-         # The path to the database cell.
-         leaf_path: Path = await self._get_leaf_path(key)
-         # Deleting key.
-         if await leaf_path.exists():
-             data_json: bytes = await leaf_path.read_bytes()
-             data: dict = orjson.loads(data_json) or {}
-             del data[key]
-             await leaf_path.write_bytes(orjson.dumps(data))
-             await self._counter_documents(-1)
-             return
-         msg: str = "`delete_key` - The unacceptable key value."
-         logger.error(msg)
-         raise KeyError()
-
-     @staticmethod
-     async def napalm() -> None:
-         """Asynchronous method for full database deletion.
-
-         The main purpose is tests.
-
-         Warning:
-             - `Be careful, this will remove all keys.`
-         """
-         with contextlib.suppress(FileNotFoundError):
-             await to_thread.run_sync(rmtree, constants.DB_ROOT)
-         return
-
-     @staticmethod
-     def _task_find(
-         key: int,
-         filter_fn: Callable,
-         HASH_REDUCE_LEFT: str,
-         db_root: str,
-         class_model: T,
-     ) -> dict[str, Any] | None:
-         """Task for searching for documents.
-
-         This method is for internal use.
-         """
-         key_as_hash: str = f"{key:08x}"[HASH_REDUCE_LEFT:]
-         separated_hash: str = "/".join(list(key_as_hash))
-         leaf_path: SyncPath = SyncPath(
-             *(
-                 db_root,
-                 class_model.__name__,
-                 separated_hash,
-                 "leaf.json",
-             ),
-         )
-         if leaf_path.exists():
-             data_json: bytes = leaf_path.read_bytes()
-             data: dict[str, str] = orjson.loads(data_json) or {}
-             for _, val in data.items():
-                 doc = class_model.model_validate_json(val)
-                 if filter_fn(doc):
-                     return doc
-         return None
-
-     def find_one(
-         self,
-         filter_fn: Callable,
-         max_workers: int | None = None,
-         timeout: float | None = None,
-     ) -> T | None:
-         """Find a single document matching the filter.
-
-         The search is based on the effect of a quantum loop.
-         The search effectiveness depends on the number of processor threads.
-         Ideally, hundreds and even thousands of threads are required.
-
-         Args:
-             filter_fn: A function that execute the conditions of filtering.
-             max_workers: The maximum number of processes that can be used to
-                 execute the given calls. If None or not given then as many
-                 worker processes will be created as the machine has processors.
-             timeout: The number of seconds to wait for the result if the future isn't done.
-                 If None, then there is no limit on the wait time.
-         """
-         keys: range = range(1, self.__max_num_keys)
-         search_task_fn: Callable = self._task_find
-         HASH_REDUCE_LEFT: int = self.__hash_reduce_left
-         db_root: str = self.__db_root
-         class_model: T = self.__class_model
-         with concurrent.futures.ThreadPoolExecutor(max_workers) as executor:
-             for key in keys:
-                 future = executor.submit(
-                     search_task_fn,
-                     key,
-                     filter_fn,
-                     HASH_REDUCE_LEFT,
-                     db_root,
-                     class_model,
-                 )
-                 doc = future.result(timeout)
-                 if doc is not None:
-                     return doc
-         return None
-
-     def find(
-         self,
-         filter_fn: Callable,
-         db_query_docs_limit: int = 1000,
-         max_workers: int | None = None,
-         timeout: float | None = None,
-     ) -> list[T] | None:
-         """Find one or more documents matching the filter.
-
-         The search is based on the effect of a quantum loop.
-         The search effectiveness depends on the number of processor threads.
-         Ideally, hundreds and even thousands of threads are required.
-
-         Args:
-             filter_fn: A function that execute the conditions of filtering.
-             db_query_docs_limit: Limiting the number of request results. By default = 1000.
-             max_workers: The maximum number of processes that can be used to
-                 execute the given calls. If None or not given then as many
-                 worker processes will be created as the machine has processors.
-             timeout: The number of seconds to wait for the result if the future isn't done.
-                 If None, then there is no limit on the wait time.
-         """
-         keys: range = range(1, self.__max_num_keys)
-         search_task_fn: Callable = self._task_find
-         HASH_REDUCE_LEFT: int = self.__hash_reduce_left
-         db_root: str = self.__db_root
-         class_model: T = self.__class_model
-         counter: int = 0
-         with concurrent.futures.ThreadPoolExecutor(max_workers) as executor:
-             results = []
-             for key in keys:
-                 if counter == db_query_docs_limit:
-                     break
-                 future = executor.submit(
-                     search_task_fn,
-                     key,
-                     filter_fn,
-                     HASH_REDUCE_LEFT,
-                     db_root,
-                     class_model,
-                 )
-                 doc = future.result(timeout)
-                 if doc is not None:
-                     results.append(doc)
-                     counter += 1
-         return results or None
-
-     def collection_name(self) -> str:
-         """Get collection name."""
-         return self.__class_model.__name__
-
-     def collection_full_name(self) -> str:
-         """Get full name of collection."""
-         return f"{self.__db_root}/{self.__class_model.__name__}"
-
-     async def estimated_document_count(self) -> int:
-         """Get an estimate of the number of documents in this collection using collection metadata."""
-         meta = await self._get_meta()
-         return meta.counter_documents
-
-     def count_documents(
-         self,
-         filter_fn: Callable,
-         max_workers: int | None = None,
-         timeout: float | None = None,
-     ) -> int:
-         """Count the number of documents a matching the filter in this collection.
-
-         The search is based on the effect of a quantum loop.
-         The search effectiveness depends on the number of processor threads.
-         Ideally, hundreds and even thousands of threads are required.
-
-         Args:
-             filter_fn: A function that execute the conditions of filtering.
-             max_workers: The maximum number of processes that can be used to
-                 execute the given calls. If None or not given then as many
-                 worker processes will be created as the machine has processors.
-             timeout: The number of seconds to wait for the result if the future isn't done.
-                 If None, then there is no limit on the wait time.
-         """
-         keys: range = range(1, self.__max_num_keys)
-         search_task_fn: Callable = self._task_find
-         HASH_REDUCE_LEFT: int = self.__hash_reduce_left
-         db_root: str = self.__db_root
-         class_model: T = self.__class_model
-         counter: int = 0
-         with concurrent.futures.ThreadPoolExecutor(max_workers) as executor:
-             for key in keys:
-                 future = executor.submit(
-                     search_task_fn,
-                     key,
-                     filter_fn,
-                     HASH_REDUCE_LEFT,
-                     db_root,
-                     class_model,
-                 )
-                 if future.result(timeout) is not None:
-                     counter += 1
-         return counter
+ """Creation and management of the database."""
+
+ from __future__ import annotations
+
+ __all__ = ("Scruby",)
+
+ import contextlib
+ import logging
+ import re
+ import zlib
+ from shutil import rmtree
+ from typing import Any, Literal, Never, assert_never
+
+ from anyio import Path
+ from pydantic import BaseModel
+
+ from scruby import constants, mixins
+
+ logger = logging.getLogger(__name__)
+
+
+ class _Meta(BaseModel):
+     """Metadata of Collection."""
+
+     db_root: str
+     collection_name: str
+     hash_reduce_left: int
+     max_branch_number: int
+     counter_documents: int
+
+
+ class Scruby(
+     mixins.Docs,
+     mixins.Find,
+     mixins.CustomTask,
+     mixins.Collection,
+     mixins.Count,
+     mixins.Delete,
+     mixins.Update,
+ ):
+     """Creation and management of database."""
+
+     def __init__( # noqa: D107
+         self,
+     ) -> None:
+         super().__init__()
+         self._meta = _Meta
+         self._db_root = constants.DB_ROOT
+         self._hash_reduce_left = constants.HASH_REDUCE_LEFT
+         # The maximum number of branches.
+         match self._hash_reduce_left:
+             case 0:
+                 self._max_branch_number = 4294967296
+             case 2:
+                 self._max_branch_number = 16777216
+             case 4:
+                 self._max_branch_number = 65536
+             case 6:
+                 self._max_branch_number = 256
+             case _ as unreachable:
+                 msg: str = f"{unreachable} - Unacceptable value for HASH_REDUCE_LEFT."
+                 logger.critical(msg)
+                 assert_never(Never(unreachable)) # pyrefly: ignore[not-callable]
+
+     @classmethod
+     async def collection(cls, class_model: Any) -> Any:
+         """Get an object to access a collection.
+
+         Args:
+             class_model: Class of Model (pydantic.BaseModel).
+
+         Returns:
+             Instance of Scruby for access a collection.
+         """
+         assert BaseModel in class_model.__bases__, "`class_model` does not contain the base class `pydantic.BaseModel`!"
+
+         instance = cls()
+         instance.__dict__["_class_model"] = class_model
+         # Caching a pati for metadata.
+         # The zero branch is reserved for metadata.
+         branch_number: int = 0
+         branch_number_as_hash: str = f"{branch_number:08x}"[constants.HASH_REDUCE_LEFT :]
+         separated_hash: str = "/".join(list(branch_number_as_hash))
+         meta_dir_path_tuple = (
+             constants.DB_ROOT,
+             class_model.__name__,
+             separated_hash,
+         )
+         instance.__dict__["_meta_path"] = Path(
+             *meta_dir_path_tuple,
+             "meta.json",
+         )
+         # Create metadata for collection, if missing.
+         branch_path = Path(*meta_dir_path_tuple)
+         if not await branch_path.exists():
+             await branch_path.mkdir(parents=True)
+             meta = _Meta(
+                 db_root=constants.DB_ROOT,
+                 collection_name=class_model.__name__,
+                 hash_reduce_left=constants.HASH_REDUCE_LEFT,
+                 max_branch_number=instance.__dict__["_max_branch_number"],
+                 counter_documents=0,
+             )
+             meta_json = meta.model_dump_json()
+             meta_path = Path(*(branch_path, "meta.json"))
+             await meta_path.write_text(meta_json, "utf-8")
+         return instance
+
+     async def get_meta(self) -> _Meta:
+         """Asynchronous method for getting metadata of collection.
+
+         This method is for internal use.
+
+         Returns:
+             Metadata object.
+         """
+         meta_json = await self._meta_path.read_text()
+         meta: _Meta = self._meta.model_validate_json(meta_json)
+         return meta
+
+     async def _set_meta(self, meta: _Meta) -> None:
+         """Asynchronous method for updating metadata of collection.
+
+         This method is for internal use.
+
+         Returns:
+             None.
+         """
+         meta_json = meta.model_dump_json()
+         await self._meta_path.write_text(meta_json, "utf-8")
+
+     async def _counter_documents(self, step: Literal[1, -1]) -> None:
+         """Asynchronous method for management of documents in metadata of collection.
+
+         This method is for internal use.
+
+         Returns:
+             None.
+         """
+         meta_path = self._meta_path
+         meta_json = await meta_path.read_text("utf-8")
+         meta: _Meta = self._meta.model_validate_json(meta_json)
+         meta.counter_documents += step
+         meta_json = meta.model_dump_json()
+         await meta_path.write_text(meta_json, "utf-8")
+
+     async def _get_leaf_path(self, key: str) -> tuple[Path, str]:
+         """Asynchronous method for getting path to collection cell by key.
+
+         This method is for internal use.
+
+         Args:
+             key (str): Key name.
+
+         Returns:
+             Path to cell of collection.
+         """
+         if not isinstance(key, str):
+             msg = "The key is not a string."
+             logger.error(msg)
+             raise KeyError(msg)
+         # Prepare key.
+         # Removes spaces at the beginning and end of a string.
+         # Replaces all whitespace characters with a single space.
+         prepared_key = re.sub(r"\s+", " ", key).strip().lower()
+         # Check the key for an empty string.
+         if len(prepared_key) == 0:
+             msg = "The key should not be empty."
+             logger.error(msg)
+             raise KeyError(msg)
+         # Key to crc32 sum.
+         key_as_hash: str = f"{zlib.crc32(prepared_key.encode('utf-8')):08x}"[self._hash_reduce_left :]
+         # Convert crc32 sum in the segment of path.
+         separated_hash: str = "/".join(list(key_as_hash))
+         # The path of the branch to the database.
+         branch_path: Path = Path(
+             *(
+                 self._db_root,
+                 self._class_model.__name__,
+                 separated_hash,
+             ),
+         )
+         # If the branch does not exist, need to create it.
+         if not await branch_path.exists():
+             await branch_path.mkdir(parents=True)
+         # The path to the database cell.
+         leaf_path: Path = Path(*(branch_path, "leaf.json"))
+         return (leaf_path, prepared_key)
+
+     @staticmethod
+     def napalm() -> None:
+         """Method for full database deletion.
+
+         The main purpose is tests.
+
+         Warning:
+             - `Be careful, this will remove all keys.`
+
+         Returns:
+             None.
+         """
+         with contextlib.suppress(FileNotFoundError):
+             rmtree(constants.DB_ROOT)
+         return
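
What the diff shows, in short: in 0.10.4 a collection was bound synchronously in the constructor (`Scruby(class_model)`), which also created the metadata file; in 0.26.0 the constructor takes no model, collections are obtained from the async classmethod `Scruby.collection()`, the key-value and search methods have moved into the `scruby.mixins` classes (not part of this file's diff), keys are normalized before hashing, collection metadata records the storage layout as well as the document counter, and `napalm()` is synchronous. A minimal usage sketch of the API visible in this file, assuming a hypothetical pydantic model `User` and that `Scruby` is imported from `scruby.db` as defined here:

import asyncio

from pydantic import BaseModel

from scruby.db import Scruby


class User(BaseModel):
    """Hypothetical model for illustration."""

    name: str


async def main() -> None:
    # 0.10.4 (removed): db = Scruby(User)
    # 0.26.0: bind the collection through the async classmethod instead.
    users = await Scruby.collection(User)
    # Metadata now carries layout fields in addition to the counter
    # (`get_meta` is public by name, though its docstring says internal use).
    meta = await users.get_meta()
    print(meta.collection_name, meta.max_branch_number, meta.counter_documents)
    # `napalm()` became synchronous in 0.26.0; in 0.10.4 it was awaited.
    Scruby.napalm()


asyncio.run(main())

Both versions shard a collection into directories by the crc32 of the key; 0.26.0 additionally normalizes the key (whitespace collapsed and trimmed, lowercased) before hashing, so keys differing only in case or spacing now land on the same leaf. An illustrative reimplementation of that derivation (`leaf_segment` is a hypothetical helper, not the library API; the real logic lives in `Scruby._get_leaf_path`):

import re
import zlib

HASH_REDUCE_LEFT = 6  # one of 0, 2, 4, or 6, per the match statement above


def leaf_segment(key: str) -> str:
    # 0.26.0 normalization: collapse whitespace runs, trim, lowercase.
    prepared = re.sub(r"\s+", " ", key).strip().lower()
    # crc32 as zero-padded hex, truncated from the left by HASH_REDUCE_LEFT.
    digest = f"{zlib.crc32(prepared.encode('utf-8')):08x}"[HASH_REDUCE_LEFT:]
    # Each remaining hex character becomes one directory level.
    return "/".join(digest)


# "  John   Smith " and "john smith" map to the same two-level path.
assert leaf_segment("  John   Smith ") == leaf_segment("john smith")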