pixeltable 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic.


This version of pixeltable might be problematic; see the package registry's advisory page for more details.

Files changed (52):
  1. pixeltable/__init__.py +1 -2
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/catalog.py +509 -103
  4. pixeltable/catalog/column.py +5 -0
  5. pixeltable/catalog/dir.py +15 -6
  6. pixeltable/catalog/globals.py +16 -0
  7. pixeltable/catalog/insertable_table.py +82 -41
  8. pixeltable/catalog/path.py +15 -0
  9. pixeltable/catalog/schema_object.py +7 -12
  10. pixeltable/catalog/table.py +81 -67
  11. pixeltable/catalog/table_version.py +23 -7
  12. pixeltable/catalog/view.py +9 -6
  13. pixeltable/env.py +15 -9
  14. pixeltable/exec/exec_node.py +1 -1
  15. pixeltable/exprs/__init__.py +2 -1
  16. pixeltable/exprs/arithmetic_expr.py +2 -0
  17. pixeltable/exprs/column_ref.py +38 -2
  18. pixeltable/exprs/expr.py +61 -12
  19. pixeltable/exprs/function_call.py +1 -4
  20. pixeltable/exprs/globals.py +12 -0
  21. pixeltable/exprs/json_mapper.py +4 -4
  22. pixeltable/exprs/json_path.py +10 -11
  23. pixeltable/exprs/similarity_expr.py +5 -20
  24. pixeltable/exprs/string_op.py +107 -0
  25. pixeltable/ext/functions/yolox.py +21 -64
  26. pixeltable/func/callable_function.py +5 -2
  27. pixeltable/func/query_template_function.py +6 -18
  28. pixeltable/func/tools.py +2 -2
  29. pixeltable/functions/__init__.py +1 -1
  30. pixeltable/functions/globals.py +16 -5
  31. pixeltable/globals.py +172 -262
  32. pixeltable/io/__init__.py +3 -2
  33. pixeltable/io/datarows.py +138 -0
  34. pixeltable/io/external_store.py +8 -5
  35. pixeltable/io/globals.py +7 -160
  36. pixeltable/io/hf_datasets.py +21 -98
  37. pixeltable/io/pandas.py +29 -43
  38. pixeltable/io/parquet.py +17 -42
  39. pixeltable/io/table_data_conduit.py +569 -0
  40. pixeltable/io/utils.py +6 -21
  41. pixeltable/metadata/__init__.py +1 -1
  42. pixeltable/metadata/converters/convert_30.py +50 -0
  43. pixeltable/metadata/converters/util.py +26 -1
  44. pixeltable/metadata/notes.py +1 -0
  45. pixeltable/metadata/schema.py +3 -0
  46. pixeltable/utils/arrow.py +32 -7
  47. pixeltable/utils/coroutine.py +41 -0
  48. {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/METADATA +1 -1
  49. {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/RECORD +52 -47
  50. {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/WHEEL +1 -1
  51. {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/LICENSE +0 -0
  52. {pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/entry_points.txt +0 -0
pixeltable/globals.py CHANGED
@@ -1,17 +1,36 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
4
+ import os
2
5
  import urllib.parse
3
- from typing import Any, Iterable, Literal, Optional, Union, cast
4
- from uuid import UUID
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, Union
5
8
 
6
9
  import pandas as pd
7
10
  from pandas.io.formats.style import Styler
8
11
 
9
- from pixeltable import DataFrame, catalog, env, exceptions as excs, exprs, func, share
10
- from pixeltable.catalog import Catalog, IfExistsParam, IfNotExistsParam
11
- from pixeltable.dataframe import DataFrameResultSet
12
+ from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share
13
+ from pixeltable.catalog import Catalog, TableVersionPath
14
+ from pixeltable.catalog.insertable_table import OnErrorParameter
12
15
  from pixeltable.env import Env
13
16
  from pixeltable.iterators import ComponentIterator
14
- from pixeltable.utils.filecache import FileCache
17
+
18
+ if TYPE_CHECKING:
19
+ import datasets # type: ignore[import-untyped]
20
+
21
+ RowData = list[dict[str, Any]]
22
+ TableDataSource = Union[
23
+ str,
24
+ os.PathLike,
25
+ Path, # OS paths, filenames, URLs
26
+ Iterator[dict[str, Any]], # iterator producing dictionaries of values
27
+ RowData, # list of dictionaries
28
+ DataFrame, # Pixeltable DataFrame
29
+ pd.DataFrame, # pandas DataFrame
30
+ 'datasets.Dataset',
31
+ 'datasets.DatasetDict', # Huggingface datasets
32
+ ]
33
+
15
34
 
16
35
  _logger = logging.getLogger('pixeltable')
17
36
 
@@ -21,58 +40,36 @@ def init() -> None:
21
40
  _ = Catalog.get()
22
41
 
23
42
 
24
- def _handle_path_collision(
25
- path: str, expected_obj_type: type[catalog.SchemaObject], expected_snapshot: bool, if_exists: catalog.IfExistsParam
26
- ) -> Optional[catalog.SchemaObject]:
27
- cat = Catalog.get()
28
- obj: Optional[catalog.SchemaObject]
29
- if if_exists == catalog.IfExistsParam.ERROR:
30
- _ = cat.get_schema_object(path, raise_if_exists=True)
31
- obj = None
32
- else:
33
- obj = cat.get_schema_object(path)
34
- is_snapshot = isinstance(obj, catalog.View) and obj._tbl_version_path.is_snapshot()
35
- if obj is not None and (not isinstance(obj, expected_obj_type) or (expected_snapshot and not is_snapshot)):
36
- obj_type_str = 'snapshot' if expected_snapshot else expected_obj_type._display_name()
37
- raise excs.Error(
38
- f'Path {path!r} already exists but is not a {obj_type_str}. Cannot {if_exists.name.lower()} it.'
39
- )
40
- if obj is None:
41
- return None
42
-
43
- if if_exists == IfExistsParam.IGNORE:
44
- return obj
45
-
46
- # drop the existing schema object
47
- if isinstance(obj, catalog.Dir):
48
- dir_contents = cat.get_dir_contents(obj._id)
49
- if len(dir_contents) > 0 and if_exists == IfExistsParam.REPLACE:
50
- raise excs.Error(
51
- f'Directory {path!r} already exists and is not empty. Use `if_exists="replace_force"` to replace it.'
52
- )
53
- _drop_dir(obj._id, path, force=True)
54
- else:
55
- assert isinstance(obj, catalog.Table)
56
- _drop_table(obj, force=if_exists == IfExistsParam.REPLACE_FORCE, is_replace=True)
57
- return None
58
-
59
-
60
43
  def create_table(
61
44
  path_str: str,
62
- schema_or_df: Union[dict[str, Any], DataFrame],
45
+ schema: Optional[dict[str, Any]] = None,
63
46
  *,
47
+ source: Optional[TableDataSource] = None,
48
+ source_format: Optional[Literal['csv', 'excel', 'parquet', 'json']] = None,
49
+ schema_overrides: Optional[dict[str, Any]] = None,
50
+ on_error: Literal['abort', 'ignore'] = 'abort',
64
51
  primary_key: Optional[Union[str, list[str]]] = None,
65
52
  num_retained_versions: int = 10,
66
53
  comment: str = '',
67
54
  media_validation: Literal['on_read', 'on_write'] = 'on_write',
68
55
  if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
56
+ extra_args: Optional[dict[str, Any]] = None, # Additional arguments to data source provider
69
57
  ) -> catalog.Table:
70
58
  """Create a new base table.
71
59
 
72
60
  Args:
73
61
  path_str: Path to the table.
74
- schema_or_df: Either a dictionary that maps column names to column types, or a
75
- [`DataFrame`][pixeltable.DataFrame] whose contents and schema will be used to pre-populate the table.
62
+ schema: A dictionary that maps column names to column types
63
+ source: A data source from which a table schema can be inferred and data imported
64
+ source_format: A hint to the format of the source data
65
+ schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
66
+ on_error: Determines the behavior if an error occurs while evaluating a computed column or detecting an
67
+ invalid media file (such as a corrupt image) for one of the inserted rows.
68
+
69
+ - If `on_error='abort'`, then an exception will be raised and the rows will not be inserted.
70
+ - If `on_error='ignore'`, then execution will continue and the rows will be inserted. Any cells
71
+ with errors will have a `None` value for that cell, with information about the error stored in the
72
+ corresponding `tbl.col_name.errortype` and `tbl.col_name.errormsg` fields.
76
73
  primary_key: An optional column name or list of column names to use as the primary key(s) of the
77
74
  table.
78
75
  num_retained_versions: Number of versions of the table to retain.
@@ -88,6 +85,7 @@ def create_table(
88
85
  - `'ignore'`: do nothing and return the existing table handle
89
86
  - `'replace'`: if the existing table has no views, drop and replace it with a new one
90
87
  - `'replace_force'`: drop the existing table and all its views, and create a new one
88
+ extra_args: Additional arguments to pass to the source data provider
91
89
 
92
90
  Returns:
93
91
  A handle to the newly created table, or to an already existing table at the path when `if_exists='ignore'`.
@@ -99,7 +97,8 @@ def create_table(
99
97
  - the path is invalid, or
100
98
  - the path already exists and `if_exists='error'`, or
101
99
  - the path already exists and is not a table, or
102
- - an error occurs while attempting to create the table.
100
+ - an error occurs while attempting to create the table, or
101
+ - an error occurs while attempting to import data from the source.
103
102
 
104
103
  Examples:
105
104
  Create a table with an int and a string column:
@@ -119,60 +118,64 @@ def create_table(
119
118
  Create a table with an int and a float column, and replace any existing table:
120
119
 
121
120
  >>> tbl = pxt.create_table('my_table', schema={'col1': pxt.Int, 'col2': pxt.Float}, if_exists='replace')
121
+
122
+ Create a table from a CSV file:
123
+
124
+ >>> tbl = pxt.create_table('my_table', source='data.csv')
122
125
  """
123
- path = catalog.Path(path_str)
124
- cat = Catalog.get()
126
+ from pixeltable.io.table_data_conduit import DFTableDataConduit, UnkTableDataConduit
127
+ from pixeltable.io.utils import normalize_primary_key_parameter
128
+
129
+ if (schema is None) == (source is None):
130
+ raise excs.Error('Must provide either a `schema` or a `source`')
131
+
132
+ if schema is not None and (len(schema) == 0 or not isinstance(schema, dict)):
133
+ raise excs.Error('`schema` must be a non-empty dictionary')
134
+
135
+ path_obj = catalog.Path(path_str)
136
+ if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
137
+ media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
138
+ primary_key: Optional[list[str]] = normalize_primary_key_parameter(primary_key)
139
+ table: catalog.Table = None
140
+ tds = None
141
+ data_source = None
142
+ if source is not None:
143
+ tds = UnkTableDataConduit(source, source_format=source_format, extra_fields=extra_args)
144
+ tds.check_source_format()
145
+ data_source = tds.specialize()
146
+ data_source.src_schema_overrides = schema_overrides
147
+ data_source.src_pk = primary_key
148
+ data_source.infer_schema()
149
+ schema = data_source.pxt_schema
150
+ primary_key = data_source.pxt_pk
151
+ is_direct_df = data_source.is_direct_df()
152
+ else:
153
+ is_direct_df = False
125
154
 
126
- with env.Env.get().begin_xact():
127
- if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
128
- existing = _handle_path_collision(path_str, catalog.InsertableTable, False, if_exists_)
129
- if existing is not None:
130
- assert isinstance(existing, catalog.Table)
131
- return existing
132
-
133
- dir = cat.get_schema_object(str(path.parent), expected=catalog.Dir, raise_if_not_exists=True)
134
- assert dir is not None
135
-
136
- df: Optional[DataFrame] = None
137
- if isinstance(schema_or_df, dict):
138
- schema = schema_or_df
139
- elif isinstance(schema_or_df, DataFrame):
140
- df = schema_or_df
141
- schema = df.schema
142
- elif isinstance(schema_or_df, DataFrameResultSet):
143
- raise excs.Error(
144
- '`schema_or_df` must be either a schema dictionary or a Pixeltable DataFrame. '
145
- '(Is there an extraneous call to `collect()`?)'
146
- )
147
- else:
148
- raise excs.Error('`schema_or_df` must be either a schema dictionary or a Pixeltable DataFrame.')
149
-
150
- if len(schema) == 0:
151
- raise excs.Error(f'Table schema is empty: `{path_str}`')
152
-
153
- if primary_key is None:
154
- primary_key = []
155
- elif isinstance(primary_key, str):
156
- primary_key = [primary_key]
157
- elif not isinstance(primary_key, list) or not all(isinstance(pk, str) for pk in primary_key):
158
- raise excs.Error('primary_key must be a single column name or a list of column names')
159
-
160
- tbl = catalog.InsertableTable._create(
161
- dir._id,
162
- path.name,
163
- schema,
164
- df,
165
- primary_key=primary_key,
166
- num_retained_versions=num_retained_versions,
167
- comment=comment,
168
- media_validation=catalog.MediaValidation.validated(media_validation, 'media_validation'),
155
+ if len(schema) == 0 or not isinstance(schema, dict):
156
+ raise excs.Error(
157
+ 'Unable to create a proper schema from supplied `source`. Please use appropriate `schema_overrides`.'
169
158
  )
170
- cat.add_tbl(tbl)
171
- return tbl
159
+
160
+ table = Catalog.get().create_table(
161
+ path_obj,
162
+ schema,
163
+ data_source.pxt_df if isinstance(data_source, DFTableDataConduit) else None,
164
+ if_exists=if_exists_,
165
+ primary_key=primary_key,
166
+ comment=comment,
167
+ media_validation=media_validation_,
168
+ num_retained_versions=num_retained_versions,
169
+ )
170
+ if data_source is not None and not is_direct_df:
171
+ fail_on_exception = OnErrorParameter.fail_on_exception(on_error)
172
+ table.insert_table_data_source(data_source=data_source, fail_on_exception=fail_on_exception)
173
+
174
+ return table
172
175
 
173
176
 
174
177
  def create_view(
175
- path_str: str,
178
+ path: str,
176
179
  base: Union[catalog.Table, DataFrame],
177
180
  *,
178
181
  additional_columns: Optional[dict[str, Any]] = None,
@@ -186,7 +189,7 @@ def create_view(
186
189
  """Create a view of an existing table object (which itself can be a view or a snapshot or a base table).
187
190
 
188
191
  Args:
189
- path_str: A name for the view; can be either a simple name such as `my_view`, or a pathname such as
192
+ path: A name for the view; can be either a simple name such as `my_view`, or a pathname such as
190
193
  `dir1.my_view`.
191
194
  base: [`Table`][pixeltable.Table] (i.e., table or view or snapshot) or [`DataFrame`][pixeltable.DataFrame] to
192
195
  base the view on.
@@ -242,8 +245,9 @@ def create_view(
242
245
  >>> tbl = pxt.get_table('my_table')
243
246
  ... view = pxt.create_view('my_view', tbl.where(tbl.col1 > 100), if_exists='replace_force')
244
247
  """
245
- where: Optional[exprs.Expr] = None
248
+ tbl_version_path: TableVersionPath
246
249
  select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]] = None
250
+ where: Optional[exprs.Expr] = None
247
251
  if isinstance(base, catalog.Table):
248
252
  tbl_version_path = base._tbl_version_path
249
253
  elif isinstance(base, DataFrame):
@@ -257,51 +261,34 @@ def create_view(
257
261
  raise excs.Error('`base` must be an instance of `Table` or `DataFrame`')
258
262
  assert isinstance(base, (catalog.Table, DataFrame))
259
263
 
260
- path = catalog.Path(path_str)
261
- cat = Catalog.get()
264
+ path_obj = catalog.Path(path)
265
+ if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
266
+ media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
262
267
 
263
- with Env.get().begin_xact():
264
- if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
265
- existing = _handle_path_collision(path_str, catalog.View, is_snapshot, if_exists_)
266
- if existing is not None:
267
- assert isinstance(existing, catalog.View)
268
- return existing
269
-
270
- dir = cat.get_schema_object(str(path.parent), expected=catalog.Dir, raise_if_not_exists=True)
271
- assert dir is not None
272
-
273
- if additional_columns is None:
274
- additional_columns = {}
275
- else:
276
- # additional columns should not be in the base table
277
- for col_name in additional_columns:
278
- if col_name in [c.name for c in tbl_version_path.columns()]:
279
- raise excs.Error(
280
- f'Column {col_name!r} already exists in the base table '
281
- f'{tbl_version_path.get_column(col_name).tbl.get().name}.'
282
- )
283
- if iterator is None:
284
- iterator_class, iterator_args = None, None
285
- else:
286
- iterator_class, iterator_args = iterator
287
-
288
- view = catalog.View._create(
289
- dir._id,
290
- path.name,
291
- base=tbl_version_path,
292
- select_list=select_list,
293
- additional_columns=additional_columns,
294
- predicate=where,
295
- is_snapshot=is_snapshot,
296
- iterator_cls=iterator_class,
297
- iterator_args=iterator_args,
298
- num_retained_versions=num_retained_versions,
299
- comment=comment,
300
- media_validation=catalog.MediaValidation.validated(media_validation, 'media_validation'),
301
- )
302
- FileCache.get().emit_eviction_warnings()
303
- cat.add_tbl(view)
304
- return view
268
+ if additional_columns is None:
269
+ additional_columns = {}
270
+ else:
271
+ # additional columns should not be in the base table
272
+ for col_name in additional_columns:
273
+ if col_name in [c.name for c in tbl_version_path.columns()]:
274
+ raise excs.Error(
275
+ f'Column {col_name!r} already exists in the base table '
276
+ f'{tbl_version_path.get_column(col_name).tbl.get().name}.'
277
+ )
278
+
279
+ return Catalog.get().create_view(
280
+ path_obj,
281
+ tbl_version_path,
282
+ select_list=select_list,
283
+ where=where,
284
+ additional_columns=additional_columns,
285
+ is_snapshot=is_snapshot,
286
+ iterator=iterator,
287
+ num_retained_versions=num_retained_versions,
288
+ comment=comment,
289
+ media_validation=media_validation_,
290
+ if_exists=if_exists_,
291
+ )
305
292
 
306
293
 
307
294
  def create_snapshot(
@@ -410,11 +397,8 @@ def get_table(path: str) -> catalog.Table:
410
397
 
411
398
  >>> tbl = pxt.get_table('my_snapshot')
412
399
  """
413
- with Env.get().begin_xact():
414
- obj = Catalog.get().get_schema_object(path, expected=catalog.Table, raise_if_not_exists=True)
415
- assert isinstance(obj, catalog.Table)
416
- obj.ensure_md_loaded()
417
- return obj
400
+ path_obj = catalog.Path(path)
401
+ return Catalog.get().get_table(path_obj)
418
402
 
419
403
 
420
404
  def move(path: str, new_path: str) -> None:
@@ -436,14 +420,13 @@ def move(path: str, new_path: str) -> None:
436
420
 
437
421
  >>>> pxt.move('dir1.my_table', 'dir1.new_name')
438
422
  """
423
+ if path == new_path:
424
+ raise excs.Error('move(): source and destination cannot be identical')
425
+ path_obj, new_path_obj = catalog.Path(path), catalog.Path(new_path)
426
+ if path_obj.is_ancestor(new_path_obj):
427
+ raise excs.Error(f'move(): cannot move {path!r} into its own subdirectory')
439
428
  cat = Catalog.get()
440
- with Env.get().begin_xact():
441
- obj = cat.get_schema_object(path, raise_if_not_exists=True)
442
- new_p = catalog.Path(new_path)
443
- dest_dir_path = str(new_p.parent)
444
- dest_dir = cat.get_schema_object(dest_dir_path, expected=catalog.Dir, raise_if_not_exists=True)
445
- _ = cat.get_schema_object(new_path, raise_if_exists=True)
446
- obj._move(new_p.name, dest_dir._id)
429
+ cat.move(path_obj, new_path_obj)
447
430
 
448
431
 
449
432
  def drop_table(
@@ -482,50 +465,19 @@ def drop_table(
482
465
  Drop a table and all its dependents:
483
466
  >>> pxt.drop_table('subdir.my_table', force=True)
484
467
  """
485
- cat = Catalog.get()
486
- tbl: Optional[catalog.Table]
487
- with Env.get().begin_xact():
488
- if isinstance(table, str):
489
- _ = catalog.Path(table) # validate path
490
- if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
491
- tbl = cast(
492
- Optional[catalog.Table],
493
- cat.get_schema_object(
494
- table,
495
- expected=catalog.Table,
496
- raise_if_not_exists=if_not_exists_ == IfNotExistsParam.ERROR and not force,
497
- ),
498
- )
499
- if tbl is None:
500
- _logger.info(f'Skipped table `{table}` (does not exist).')
501
- return
502
- else:
503
- tbl = table
504
- _drop_table(tbl, force=force, is_replace=False)
505
-
506
-
507
- def _drop_table(tbl: catalog.Table, force: bool, is_replace: bool) -> None:
508
- cat = Catalog.get()
509
- view_ids = cat.get_views(tbl._id)
510
- if len(view_ids) > 0:
511
- view_paths = [cat.get_tbl_path(id) for id in view_ids]
512
- if force:
513
- for view_path in view_paths:
514
- drop_table(view_path, force=True)
515
- else:
516
- is_snapshot = tbl._tbl_version_path.is_snapshot()
517
- obj_type_str = 'Snapshot' if is_snapshot else tbl._display_name().capitalize()
518
- msg: str
519
- if is_replace:
520
- msg = (
521
- f'{obj_type_str} {tbl._path()} already exists and has dependents: {", ".join(view_paths)}. '
522
- "Use `if_exists='replace_force'` to replace it."
523
- )
524
- else:
525
- msg = f'{obj_type_str} {tbl._path()} has dependents: {", ".join(view_paths)}'
526
- raise excs.Error(msg)
527
- tbl._drop()
528
- _logger.info(f'Dropped table `{tbl._path()}`.')
468
+ tbl_path: str
469
+ if isinstance(table, catalog.Table):
470
+ # if we're dropping a table by handle, we first need to get the current path, then drop the S lock on
471
+ # the Table record, and then get X locks in the correct order (first containing directory, then table)
472
+ with Env.get().begin_xact():
473
+ tbl_path = table._path()
474
+ else:
475
+ assert isinstance(table, str)
476
+ tbl_path = table
477
+
478
+ path_obj = catalog.Path(tbl_path)
479
+ if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
480
+ Catalog.get().drop_table(path_obj, force=force, if_not_exists=if_not_exists_)
529
481
 
530
482
 
531
483
  def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
@@ -551,16 +503,14 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
551
503
 
552
504
  >>> pxt.list_tables('dir1')
553
505
  """
554
- _ = catalog.Path(dir_path, empty_is_valid=True) # validate format
506
+ path_obj = catalog.Path(dir_path, empty_is_valid=True) # validate format
555
507
  cat = Catalog.get()
556
- with Env.get().begin_xact():
557
- dir = cat.get_schema_object(dir_path, expected=catalog.Dir, raise_if_not_exists=True)
558
- contents = cat.get_dir_contents(dir._id, recursive=recursive)
559
- return _extract_paths(contents, prefix=dir_path, entry_type=catalog.Table)
508
+ contents = cat.get_dir_contents(path_obj, recursive=recursive)
509
+ return [str(p) for p in _extract_paths(contents, parent=path_obj, entry_type=catalog.Table)]
560
510
 
561
511
 
562
512
  def create_dir(
563
- path: str, if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error'
513
+ path: str, if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error', parents: bool = False
564
514
  ) -> Optional[catalog.Dir]:
565
515
  """Create a directory.
566
516
 
@@ -573,6 +523,7 @@ def create_dir(
573
523
  - `'ignore'`: do nothing and return the existing directory handle
574
524
  - `'replace'`: if the existing directory is empty, drop it and create a new one
575
525
  - `'replace_force'`: drop the existing directory and all its children, and create a new one
526
+ parents: Create missing parent directories.
576
527
 
577
528
  Returns:
578
529
  A handle to the newly created directory, or to an already existing directory at the path when
@@ -600,22 +551,14 @@ def create_dir(
600
551
  Create a directory and replace if it already exists:
601
552
 
602
553
  >>> pxt.create_dir('my_dir', if_exists='replace_force')
603
- """
604
- path_obj = catalog.Path(path)
605
- cat = Catalog.get()
606
554
 
607
- with env.Env.get().begin_xact():
608
- if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
609
- existing = _handle_path_collision(path, catalog.Dir, False, if_exists_)
610
- if existing is not None:
611
- assert isinstance(existing, catalog.Dir)
612
- return existing
555
+ Create a subdirectory along with its ancestors:
613
556
 
614
- parent = cat.get_schema_object(str(path_obj.parent))
615
- assert parent is not None
616
- dir = catalog.Dir._create(parent._id, path_obj.name)
617
- Env.get().console_logger.info(f'Created directory {path!r}.')
618
- return dir
557
+ >>> pxt.create_dir('parent1.parent2.sub_dir', parents=True)
558
+ """
559
+ path_obj = catalog.Path(path)
560
+ if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
561
+ return Catalog.get().create_dir(path_obj, if_exists=if_exists_, parents=parents)
619
562
 
620
563
 
621
564
  def drop_dir(path: str, force: bool = False, if_not_exists: Literal['error', 'ignore'] = 'error') -> None:
@@ -655,47 +598,16 @@ def drop_dir(path: str, force: bool = False, if_not_exists: Literal['error', 'ig
655
598
 
656
599
  >>> pxt.drop_dir('my_dir', force=True)
657
600
  """
658
- _ = catalog.Path(path) # validate format
659
- cat = Catalog.get()
601
+ path_obj = catalog.Path(path) # validate format
660
602
  if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
661
- with Env.get().begin_xact():
662
- dir = cat.get_schema_object(
663
- path,
664
- expected=catalog.Dir,
665
- raise_if_not_exists=if_not_exists_ == catalog.IfNotExistsParam.ERROR and not force,
666
- )
667
- if dir is None:
668
- _logger.info(f'Directory {path!r} does not exist, skipped drop_dir().')
669
- return
670
- _drop_dir(dir._id, path, force=force)
671
-
672
-
673
- def _drop_dir(dir_id: UUID, path: str, force: bool = False) -> None:
674
- cat = Catalog.get()
675
- dir_entries = cat.get_dir_contents(dir_id, recursive=False)
676
- if len(dir_entries) > 0 and not force:
677
- raise excs.Error(f'Directory {path!r} is not empty.')
678
- tbl_paths = [_join_path(path, entry.table.md['name']) for entry in dir_entries.values() if entry.table is not None]
679
- dir_paths = [_join_path(path, entry.dir.md['name']) for entry in dir_entries.values() if entry.dir is not None]
680
-
681
- for tbl_path in tbl_paths:
682
- # check if the table still exists, it might be a view that already got force-deleted
683
- if cat.get_schema_object(tbl_path, expected=catalog.Table, raise_if_not_exists=False) is not None:
684
- drop_table(tbl_path, force=True)
685
- for dir_path in dir_paths:
686
- drop_dir(dir_path, force=True)
687
- cat.drop_dir(dir_id)
688
- _logger.info(f'Removed directory {path!r}.')
689
-
690
-
691
- def _join_path(path: str, name: str) -> str:
692
- """Append name to path, if path is not empty."""
693
- return f'{path}.{name}' if path else name
603
+ Catalog.get().drop_dir(path_obj, if_not_exists=if_not_exists_, force=force)
694
604
 
695
605
 
696
606
  def _extract_paths(
697
- dir_entries: dict[str, Catalog.DirEntry], prefix: str, entry_type: Optional[type[catalog.SchemaObject]] = None
698
- ) -> list[str]:
607
+ dir_entries: dict[str, Catalog.DirEntry],
608
+ parent: catalog.Path,
609
+ entry_type: Optional[type[catalog.SchemaObject]] = None,
610
+ ) -> list[catalog.Path]:
699
611
  """Convert nested dir_entries structure to a flattened list of paths."""
700
612
  matches: list[str]
701
613
  if entry_type is None:
@@ -704,9 +616,9 @@ def _extract_paths(
704
616
  matches = [name for name, entry in dir_entries.items() if entry.dir is not None]
705
617
  else:
706
618
  matches = [name for name, entry in dir_entries.items() if entry.table is not None]
707
- result = [_join_path(prefix, name) for name in matches]
619
+ result = [parent.append(name) for name in matches]
708
620
  for name, entry in [(name, entry) for name, entry in dir_entries.items() if len(entry.dir_entries) > 0]:
709
- result.extend(_extract_paths(entry.dir_entries, prefix=_join_path(prefix, name), entry_type=entry_type))
621
+ result.extend(_extract_paths(entry.dir_entries, parent=parent.append(name), entry_type=entry_type))
710
622
  return result
711
623
 
712
624
 
@@ -717,11 +629,11 @@ def publish_snapshot(dest_uri: str, table: catalog.Table) -> None:
717
629
  share.publish_snapshot(dest_uri, table)
718
630
 
719
631
 
720
- def list_dirs(path_str: str = '', recursive: bool = True) -> list[str]:
632
+ def list_dirs(path: str = '', recursive: bool = True) -> list[str]:
721
633
  """List the directories in a directory.
722
634
 
723
635
  Args:
724
- path_str: Name or path of the directory.
636
+ path: Name or path of the directory.
725
637
  recursive: If `True`, lists all descendants of this directory recursively.
726
638
 
727
639
  Returns:
@@ -734,12 +646,10 @@ def list_dirs(path_str: str = '', recursive: bool = True) -> list[str]:
734
646
  >>> cl.list_dirs('my_dir', recursive=True)
735
647
  ['my_dir', 'my_dir.sub_dir1']
736
648
  """
737
- _ = catalog.Path(path_str, empty_is_valid=True) # validate format
649
+ path_obj = catalog.Path(path, empty_is_valid=True) # validate format
738
650
  cat = Catalog.get()
739
- with Env.get().begin_xact():
740
- dir = cat.get_schema_object(path_str, expected=catalog.Dir, raise_if_not_exists=True)
741
- contents = cat.get_dir_contents(dir._id, recursive=recursive)
742
- return _extract_paths(contents, prefix=path_str, entry_type=catalog.Dir)
651
+ contents = cat.get_dir_contents(path_obj, recursive=recursive)
652
+ return [str(p) for p in _extract_paths(contents, parent=path_obj, entry_type=catalog.Dir)]
743
653
 
744
654
 
745
655
  def list_functions() -> Styler:
pixeltable/io/__init__.py CHANGED
@@ -1,11 +1,12 @@
1
+ from .datarows import import_json, import_rows
1
2
  from .external_store import ExternalStore, SyncStatus
2
- from .globals import create_label_studio_project, export_images_as_fo_dataset, import_json, import_rows
3
+ from .globals import create_label_studio_project, export_images_as_fo_dataset
3
4
  from .hf_datasets import import_huggingface_dataset
4
5
  from .pandas import import_csv, import_excel, import_pandas
5
6
  from .parquet import export_parquet, import_parquet
6
7
 
7
8
  __default_dir = set(symbol for symbol in dir() if not symbol.startswith('_'))
8
- __removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet'}
9
+ __removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet', 'datarows'}
9
10
  __all__ = sorted(list(__default_dir - __removed_symbols))
10
11
 
11
12