datachain-0.7.5-py3-none-any.whl → datachain-0.7.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Note: this version of datachain has been flagged as potentially problematic by the registry.
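The changes fall into two groups: the data-storage layer loses its standalone ID generator (datachain/data_storage/id_generator.py is deleted, and Catalog, Session, and the UDF dispatcher drop the corresponding plumbing), and the Studio integration gains job submission (new upload-file and job endpoints on StudioClient, plus a `run` subcommand in datachain/studio.py).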

datachain/query/dispatch.py CHANGED

```diff
@@ -65,7 +65,6 @@ def udf_entrypoint() -> int:
     dispatch = UDFDispatcher(
         udf_info["udf_data"],
         udf_info["catalog_init"],
-        udf_info["id_generator_clone_params"],
         udf_info["metastore_clone_params"],
         udf_info["warehouse_clone_params"],
         udf_fields=udf_info["udf_fields"],
@@ -119,7 +118,6 @@ class UDFDispatcher:
         self,
         udf_data,
         catalog_init_params,
-        id_generator_clone_params,
         metastore_clone_params,
         warehouse_clone_params,
         udf_fields: "Sequence[str]",
@@ -129,11 +127,6 @@ class UDFDispatcher:
     ):
         self.udf_data = udf_data
         self.catalog_init_params = catalog_init_params
-        (
-            self.id_generator_class,
-            self.id_generator_args,
-            self.id_generator_kwargs,
-        ) = id_generator_clone_params
         (
             self.metastore_class,
             self.metastore_args,
@@ -155,18 +148,13 @@ class UDFDispatcher:
 
     def _create_worker(self) -> "UDFWorker":
         if not self.catalog:
-            id_generator = self.id_generator_class(
-                *self.id_generator_args, **self.id_generator_kwargs
-            )
             metastore = self.metastore_class(
                 *self.metastore_args, **self.metastore_kwargs
             )
             warehouse = self.warehouse_class(
                 *self.warehouse_args, **self.warehouse_kwargs
             )
-            self.catalog = Catalog(
-                id_generator, metastore, warehouse, **self.catalog_init_params
-            )
+            self.catalog = Catalog(metastore, warehouse, **self.catalog_init_params)
         self.udf = loads(self.udf_data)
         return UDFWorker(
             self.catalog,
```
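Read together, these hunks remove every trace of the ID-generator plumbing from the dispatcher: it no longer receives clone parameters for one, and `Catalog` is now built from just a metastore and a warehouse. A minimal sketch of the new construction path (the `build_catalog` helper and its arguments are illustrative, not part of the library's documented API; only the `Catalog(...)` call reflects the diff):

```python
from datachain.catalog import Catalog

def build_catalog(metastore, warehouse, **catalog_init_params):
    # 0.7.5: Catalog(id_generator, metastore, warehouse, **catalog_init_params)
    # 0.7.6: the id_generator positional argument is gone entirely.
    return Catalog(metastore, warehouse, **catalog_init_params)
```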
datachain/query/session.py CHANGED

```diff
@@ -85,7 +85,6 @@ class Session:
         if self.is_new_catalog:
             self.catalog.metastore.close_on_exit()
             self.catalog.warehouse.close_on_exit()
-            self.catalog.id_generator.close_on_exit()
 
         if Session.SESSION_CONTEXTS:
             Session.SESSION_CONTEXTS.pop()
```
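Session teardown shrinks accordingly: with the ID generator gone, only the metastore and warehouse connections need closing when a session exits.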
datachain/remote/studio.py CHANGED

```diff
@@ -1,3 +1,4 @@
+import base64
 import json
 import logging
 import os
@@ -23,7 +24,8 @@ DatasetStatsData = Optional[DatasetStats]
 DatasetRowsData = Optional[Iterable[dict[str, Any]]]
 DatasetExportStatus = Optional[dict[str, Any]]
 DatasetExportSignedUrls = Optional[list[str]]
-
+FileUploadData = Optional[dict[str, Any]]
+JobData = Optional[dict[str, Any]]
 
 logger = logging.getLogger("datachain")
 
@@ -308,3 +310,33 @@ class StudioClient:
             "datachain/dataset-export-status",
             {"dataset_name": name, "dataset_version": version},
         )
+
+    def upload_file(self, file_name: str, content: bytes) -> Response[FileUploadData]:
+        data = {
+            "file_content": base64.b64encode(content).decode("utf-8"),
+            "file_name": file_name,
+        }
+        return self._send_request("datachain/upload-file", data)
+
+    def create_job(
+        self,
+        query: str,
+        query_type: str,
+        environment: Optional[str] = None,
+        workers: Optional[int] = None,
+        query_name: Optional[str] = None,
+        files: Optional[list[str]] = None,
+        python_version: Optional[str] = None,
+        requirements: Optional[str] = None,
+    ) -> Response[JobData]:
+        data = {
+            "query": query,
+            "query_type": query_type,
+            "environment": environment,
+            "workers": workers,
+            "query_name": query_name,
+            "files": files,
+            "python_version": python_version,
+            "requirements": requirements,
+        }
+        return self._send_request("datachain/job", data)
```
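Both new methods go through the existing `_send_request` helper, so they return the same `Response` wrapper as the other endpoints, and file content travels base64-encoded inside the JSON payload rather than as a multipart upload. A rough usage sketch, assuming Studio credentials are already configured (the team name and file contents are placeholders):

```python
from datachain.remote.studio import StudioClient

client = StudioClient(team="my-team")

# Upload an auxiliary file; per the CLI code below, the server is
# expected to answer with a blob id in response.data["blob"]["id"].
upload = client.upload_file("helper.py", b"print('hello')\n")
file_id = upload.data.get("blob", {}).get("id") if upload.ok and upload.data else None

# Submit a job referencing the uploaded file.
job = client.create_job(
    query="print('job body')\n",  # script text, not a path
    query_type="PYTHON",
    files=[str(file_id)] if file_id else [],
)
```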
datachain/studio.py CHANGED

```diff
@@ -34,6 +34,19 @@ def process_studio_cli_args(args: "Namespace"):
         print(tabulate(rows, headers="keys"))
         return 0
 
+    if args.cmd == "run":
+        return create_job(
+            args.query_file,
+            args.team,
+            args.env_file,
+            args.env,
+            args.workers,
+            args.files,
+            args.python_version,
+            args.req,
+            args.req_file,
+        )
+
     if args.cmd == "team":
         return set_team(args)
     raise DataChainError(f"Unknown command '{args.cmd}'.")
@@ -168,3 +181,70 @@ def save_config(hostname, token):
     conf["studio"] = studio_conf
 
     return config.config_file()
+
+
+def create_job(
+    query_file: str,
+    team_name: Optional[str],
+    env_file: Optional[str] = None,
+    env: Optional[list[str]] = None,
+    workers: Optional[int] = None,
+    files: Optional[list[str]] = None,
+    python_version: Optional[str] = None,
+    req: Optional[list[str]] = None,
+    req_file: Optional[str] = None,
+):
+    query_type = "PYTHON" if query_file.endswith(".py") else "SHELL"
+    with open(query_file) as f:
+        query = f.read()
+
+    environment = "\n".join(env) if env else ""
+    if env_file:
+        with open(env_file) as f:
+            environment = f.read() + "\n" + environment
+
+    requirements = "\n".join(req) if req else ""
+    if req_file:
+        with open(req_file) as f:
+            requirements = f.read() + "\n" + requirements
+
+    client = StudioClient(team=team_name)
+    file_ids = upload_files(client, files) if files else []
+
+    response = client.create_job(
+        query=query,
+        query_type=query_type,
+        environment=environment,
+        workers=workers,
+        query_name=os.path.basename(query_file),
+        files=file_ids,
+        python_version=python_version,
+        requirements=requirements,
+    )
+    if not response.ok:
+        raise_remote_error(response.message)
+
+    if not response.data:
+        raise DataChainError("Failed to create job")
+
+    print(f"Job {response.data.get('job', {}).get('id')} created")
+    print("Open the job in Studio at", response.data.get("job", {}).get("url"))
+
+
+def upload_files(client: StudioClient, files: list[str]) -> list[str]:
+    file_ids = []
+    for file in files:
+        file_name = os.path.basename(file)
+        with open(file, "rb") as f:
+            file_content = f.read()
+        response = client.upload_file(file_name, file_content)
+        if not response.ok:
+            raise_remote_error(response.message)
+
+        if not response.data:
+            raise DataChainError(f"Failed to upload file {file_name}")
+
+        file_id = response.data.get("blob", {}).get("id")
+        if file_id:
+            file_ids.append(str(file_id))
+    return file_ids
```
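Note the merge order in `create_job`: when a file and inline values are both supplied, the file's contents come first and the inline values are appended after a newline, and the query type is inferred purely from the `.py` extension. A short sketch of calling the helper directly (the file names and team are placeholders; the CLI flag spellings for the `run` subcommand live in datachain/cli.py, which this diff does not show):

```python
from datachain.studio import create_job

create_job(
    "my_query.py",       # ends with .py, so query_type becomes "PYTHON"
    "my-team",
    env=["FOO=bar"],     # joined with newlines into the environment blob
    req=["pandas"],      # joined with newlines into requirements
)
```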
datachain-0.7.5.dist-info/METADATA → datachain-0.7.6.dist-info/METADATA RENAMED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.7.5
+Version: 0.7.6
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
```
datachain-0.7.5.dist-info/RECORD → datachain-0.7.6.dist-info/RECORD RENAMED

```diff
@@ -2,10 +2,10 @@ datachain/__init__.py,sha256=ofPJ6B-d-ybSDRrE7J6wqF_ZRAB2W9U8l-eeuBtqPLg,865
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=5aKrjnUxk0mtnZeFKNJd1DCE0MsnSoyJBZkr0y9H_a0,9313
 datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
-datachain/cli.py,sha256=1hiBClE1kbRyx0DK3uX5KMVa0ktbsG6TsFSNvoT2xxs,39399
+datachain/cli.py,sha256=Ysm-6Kb-54FfkN35VJIe5vW7Kik8VGA3wcyCUnqPBHg,42245
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
-datachain/dataset.py,sha256=4KciGp7XesQ8MsAzs8G7854ZoVcMdPE0ultVh-XSkIw,18597
+datachain/dataset.py,sha256=P-pDBgvPqJGDhq_I7fwCfb6hY8E8mIAO8Q0NT7SNlNE,19128
 datachain/error.py,sha256=bxAAL32lSeMgzsQDEHbGTGORj-mPzzpCRvWDPueJNN4,1092
 datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
 datachain/listing.py,sha256=TgKg25ZWAP5enzKgw2_2GUPJVdnQUh6uySHB5SJrUY4,7773
@@ -14,13 +14,13 @@ datachain/nodes_fetcher.py,sha256=ILMzUW5o4_6lUOVrLDC9gJPCXfcgKnMG68plrc7dAOA,11
 datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
 datachain/progress.py,sha256=5KotcvvzAUL_RF0GEj4JY0IB1lyImnmHxe89YkT1XO4,4330
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/studio.py,sha256=w41vgVPrBfJ02XQOaDccLbh-1uSAfq9cAgOmkYUqExE,4845
+datachain/studio.py,sha256=MthVADn-jM2I5TlESOfbzFnKGZjpuk9bM8m2vqOK-C8,7227
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=-mSFowjIidJ4_sMXInvNHLn4rK_QnHuIlLuH1_lMGmI,13897
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=l_HAxor5i_F03VvbmMuwhi4INhsmNrqubyydPhXWo2Y,57980
+datachain/catalog/catalog.py,sha256=s4fat0jjP3JPq0RGQ9zfzRkX1JavxxCrcB1tJKMgsks,57686
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
-datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
+datachain/catalog/loader.py,sha256=HA_mBC7q_My8j2WnSvIjUGuJpl6SIdg5vvy_lagxJlA,5733
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
 datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
 datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
@@ -29,15 +29,14 @@ datachain/client/gcs.py,sha256=cnTIr5GS6dbYOEYfqehhyQu3dr6XNjPHSg5U3FkivUk,4124
 datachain/client/hf.py,sha256=XeVJVbiNViZCpn3sfb90Fr8SYO3BdLmfE3hOWMoqInE,951
 datachain/client/local.py,sha256=vwbgCwZ7IqY2voj2l7tLJjgov7Dp--fEUvUwUBsMbls,4457
 datachain/client/s3.py,sha256=CVHBUZ1Ic2Q3370nl-Bbe69phuWjFlrVv9dTJKBpRT0,6019
-datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
+datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
 datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
-datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
-datachain/data_storage/metastore.py,sha256=VPq-Dl8P-RbZQMzn6vB9aXBPKUWPTwP8ypkaVfE-7PU,37661
+datachain/data_storage/metastore.py,sha256=EzSsfR_l_84i1AewYygpdsJyzGqEmvXjpeohlYF7h4A,37435
 datachain/data_storage/schema.py,sha256=-QVlRvD0dfu-ZFUxylEoSnLJLnleMEjVlcAb2OGu-AY,9895
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=nF-2B-n8YZh9cJlZv4XnbahAJDW6pvrp1h9L-140M7A,27538
-datachain/data_storage/warehouse.py,sha256=kFLhYEFkpsfl65Lr1c4t4HJt3nO1Ez_QQ76aQNN30fc,30966
+datachain/data_storage/sqlite.py,sha256=D_ZQ0PHmZzHO2dinv4naVJocUDIZUwV4WAz692C1cyk,22521
+datachain/data_storage/warehouse.py,sha256=tjIkU-5JywBR0apCyqTcwSyaRtGxhu2L7IVjrz-55uc,30802
 datachain/func/__init__.py,sha256=4VUt5BaLdBAl_BnAku0Jb8plqd7kDOiYrQTMG3pN0c4,794
 datachain/func/aggregate.py,sha256=7_IPrIwb2XSs3zG4iOr1eTvzn6kNVe2mkzvNzjusDHk,10942
 datachain/func/array.py,sha256=zHDNWuWLA7HVa9FEvQeHhVi00_xqenyleTqcLwkXWBI,5477
@@ -61,7 +60,7 @@ datachain/lib/listing.py,sha256=cVkCp7TRVpcZKSx-Bbk9t51bQI9Mw0o86W6ZPhAsuzM,3667
 datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
 datachain/lib/meta_formats.py,sha256=anK2bDVbaeCCh0yvKUBaW2MVos3zRgdaSV8uSduzPcU,6680
 datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
-datachain/lib/pytorch.py,sha256=W-ARi2xH1f1DUkVfRuerW-YWYgSaJASmNCxtz2lrJGI,6072
+datachain/lib/pytorch.py,sha256=Nh6fUbQMLX8OpZvX4tw4bJjTCQpRKi0jSLgkJnLHdTM,5880
 datachain/lib/settings.py,sha256=ZELRCTLbi5vzRPiDX6cQ9LLg9TefJ_A05gIGni0lll8,2535
 datachain/lib/signal_schema.py,sha256=_uh19nCKhiD9ua8oIN1Q8R9iYv1BZAuqTJCLYVmyW8k,24557
 datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
@@ -88,15 +87,15 @@ datachain/model/ultralytics/pose.py,sha256=71KBTcoST2wcEtsyGXqLVpvUtqbp9gwZGA15p
 datachain/model/ultralytics/segment.py,sha256=Z1ab0tZRJubSYNH4KkFlzhYeGNTfAyC71KmkQcToHDQ,2760
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=5fEhORFe7li12SdYddaSK3LyqksMfCHhwN1_A6TfsA4,3485
-datachain/query/dataset.py,sha256=bQVG4WnJfBQpvnxouIdDlsJF2gB8V4lDp4Zu9JeZ-rc,54771
-datachain/query/dispatch.py,sha256=wjjTWw6sFQbB9SKRh78VbfvwSMgJXCfqJklS3-9KnCU,12025
+datachain/query/dataset.py,sha256=o9Ssa47t1IM78qcaoCeTL-rp4fZCpYfR7XFjw2hGWeY,54632
+datachain/query/dispatch.py,sha256=fZ0TgGFRcsrYh1iXQoZVjkUl4Xetom9PSHoeDes3IRs,11606
 datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
 datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
 datachain/query/schema.py,sha256=b_KnVy6B26Ol4nYG0LqNNpeQ1QYPk95YRGUjXfdaQWs,6606
-datachain/query/session.py,sha256=50SOdLNCjqHHKI-L4xGXyzTVxzMWfANqKqjeYre-c2k,5959
+datachain/query/session.py,sha256=vvLIJ5b8eElovHLAWq_CZJXmN5t7C7iAZA7x9wPPOms,5905
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/remote/studio.py,sha256=z9DTDqfdWKT8MC23wRDTOHvI8hc_OySS1Ce3F617gjA,9906
+datachain/remote/studio.py,sha256=jp6NWo7OPUxqO8uYEHP0_XFlmj47rMxC80qKQ7rA3Xk,11024
 datachain/sql/__init__.py,sha256=6SQRdbljO3d2hx3EAVXEZrHQKv5jth0Jh98PogT59No,262
 datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
 datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
@@ -117,9 +116,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ZgDcrNiKiPXZmKD591_1z9qRIXitu5zwAsoVPB7ykiU,2508
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.7.5.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.7.5.dist-info/METADATA,sha256=GHZwnQPiDUBHJMdyMf-t7LJkMPyAkygNewihcXwETSs,18006
-datachain-0.7.5.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-datachain-0.7.5.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.7.5.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.7.5.dist-info/RECORD,,
+datachain-0.7.6.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.7.6.dist-info/METADATA,sha256=KMChqSG7d_lMaF9BYNIgmijvnxZbDm5gCEg980gUGOA,18006
+datachain-0.7.6.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+datachain-0.7.6.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.7.6.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.7.6.dist-info/RECORD,,
```
datachain/data_storage/id_generator.py DELETED

```diff
@@ -1,136 +0,0 @@
-import logging
-from abc import ABC, abstractmethod
-from collections.abc import Iterable
-from functools import cached_property
-from typing import TYPE_CHECKING, Optional
-
-from sqlalchemy import Column, Integer, Table, Text
-
-from datachain.data_storage.serializer import Serializable
-
-if TYPE_CHECKING:
-    from sqlalchemy.schema import SchemaItem
-
-    from datachain.data_storage.db_engine import DatabaseEngine
-
-
-logger = logging.getLogger("datachain")
-
-
-class AbstractIDGenerator(ABC, Serializable):
-    """
-    Abstract ID Generator class. This class is responsible for generating
-    unique IDs for each prefix (e.g. S3 bucket or dataset).
-    """
-
-    @abstractmethod
-    def clone(self) -> "AbstractIDGenerator":
-        """Clones AbstractIDGenerator implementation."""
-
-    def init(self) -> None:
-        """Initialize ID generator."""
-
-    def cleanup_for_tests(self):
-        """Cleanup for tests."""
-
-    def close(self) -> None:
-        """Closes any active database connections."""
-
-    def close_on_exit(self) -> None:
-        """Closes any active database or HTTP connections, called on Session exit or
-        for test cleanup only, as some ID Generator implementations may handle this
-        differently.
-        """
-        self.close()
-
-    @abstractmethod
-    def init_id(self, uri: str) -> None:
-        """Initializes the ID generator for the given URI with zero last_id."""
-
-    @abstractmethod
-    def get_next_ids(self, uri: str, count: int) -> range:
-        """Returns a range of IDs for the given URI."""
-
-    def get_next_id(self, uri: str) -> int:
-        """Returns the next ID for the given URI."""
-        return self.get_next_ids(uri, 1)[0]
-
-    def delete_uri(self, uri: str):
-        """Deletes the given URI."""
-        self.delete_uris([uri])
-
-    def delete_uris(self, uris: Iterable[str]):
-        """Deletes the given URIs."""
-
-
-class AbstractDBIDGenerator(AbstractIDGenerator):
-    """
-    Abstract ID Generator class, to be implemented by any Database Adapters
-    for a specific database system. This class is responsible for generating
-    unique IDs for each prefix (e.g. S3 bucket or dataset) and storing them
-    in a database. It is also responsible for initializing the database
-    and creating the necessary tables.
-    """
-
-    _db: "DatabaseEngine"
-    _table_prefix: Optional[str] = None
-    _skip_db_init: bool = False
-    _base_table_name = "id_generator"
-
-    def __init__(
-        self,
-        db: "DatabaseEngine",
-        table_prefix: Optional[str] = None,
-        skip_db_init: bool = False,
-    ):
-        self._db = db
-        self._table_prefix = table_prefix
-        self._skip_db_init = skip_db_init
-        if db and not skip_db_init:
-            self.init()
-
-    @abstractmethod
-    def clone(self) -> "AbstractDBIDGenerator":
-        """Clones AbstractIDGenerator implementation."""
-
-    def close(self) -> None:
-        """Closes any active database connections."""
-        self.db.close()
-
-    @property
-    def db(self) -> "DatabaseEngine":
-        return self._db
-
-    @property
-    def _columns(self) -> list["SchemaItem"]:
-        return [
-            Column("uri", Text, primary_key=True, nullable=False),
-            # This is the last id used (and starts at zero if no ids have been used)
-            Column("last_id", Integer, nullable=False),
-        ]
-
-    @cached_property
-    def _table(self) -> Table:
-        table_name = self._base_table_name
-        if self._table_prefix:
-            table_name = f"{self._table_prefix}_{table_name}"
-        return Table(table_name, self.db.metadata, *self._columns, extend_existing=True)
-
-    def init(self) -> None:
-        self.db.create_table(self._table, if_not_exists=True)
-
-    def cleanup_for_tests(self):
-        """Cleanup for tests."""
-        self.db.drop_table(self._table, if_exists=True)
-
-    @abstractmethod
-    def init_id(self, uri: str) -> None:
-        """Initializes the ID generator for the given URI with zero last_id."""
-
-    @abstractmethod
-    def get_next_ids(self, uri: str, count: int) -> range:
-        """Returns a range of IDs for the given URI."""
-
-    def delete_uris(self, uris: Iterable[str]):
-        """Deletes the given URIs from the database."""
-        self.db.execute(self._table.delete().where(self._table.c.uri.in_(uris)))
```
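This deleted module is the counterpart of the Catalog, Session, and dispatcher changes above: nothing clones, serializes, or allocates IDs from the `id_generator` table anymore, which is also why the sqlite.py, metastore.py, warehouse.py, and catalog/loader.py entries in the RECORD all shrink.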