pixeltable 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries; it is provided for informational purposes only.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (110)
  1. pixeltable/__init__.py +20 -9
  2. pixeltable/__version__.py +3 -0
  3. pixeltable/catalog/column.py +23 -7
  4. pixeltable/catalog/insertable_table.py +32 -19
  5. pixeltable/catalog/table.py +210 -20
  6. pixeltable/catalog/table_version.py +272 -111
  7. pixeltable/catalog/table_version_path.py +6 -1
  8. pixeltable/dataframe.py +184 -110
  9. pixeltable/datatransfer/__init__.py +1 -0
  10. pixeltable/datatransfer/label_studio.py +526 -0
  11. pixeltable/datatransfer/remote.py +113 -0
  12. pixeltable/env.py +213 -79
  13. pixeltable/exec/__init__.py +2 -1
  14. pixeltable/exec/data_row_batch.py +6 -7
  15. pixeltable/exec/expr_eval_node.py +28 -28
  16. pixeltable/exec/sql_scan_node.py +7 -6
  17. pixeltable/exprs/__init__.py +4 -3
  18. pixeltable/exprs/column_ref.py +11 -2
  19. pixeltable/exprs/comparison.py +39 -1
  20. pixeltable/exprs/data_row.py +7 -0
  21. pixeltable/exprs/expr.py +26 -19
  22. pixeltable/exprs/function_call.py +17 -18
  23. pixeltable/exprs/globals.py +14 -2
  24. pixeltable/exprs/image_member_access.py +9 -28
  25. pixeltable/exprs/in_predicate.py +96 -0
  26. pixeltable/exprs/inline_array.py +13 -11
  27. pixeltable/exprs/inline_dict.py +15 -13
  28. pixeltable/exprs/row_builder.py +7 -1
  29. pixeltable/exprs/similarity_expr.py +67 -0
  30. pixeltable/ext/functions/whisperx.py +30 -0
  31. pixeltable/ext/functions/yolox.py +16 -0
  32. pixeltable/func/__init__.py +0 -2
  33. pixeltable/func/aggregate_function.py +5 -2
  34. pixeltable/func/callable_function.py +57 -13
  35. pixeltable/func/expr_template_function.py +14 -3
  36. pixeltable/func/function.py +35 -4
  37. pixeltable/func/signature.py +5 -15
  38. pixeltable/func/udf.py +8 -12
  39. pixeltable/functions/fireworks.py +9 -4
  40. pixeltable/functions/huggingface.py +48 -5
  41. pixeltable/functions/openai.py +49 -11
  42. pixeltable/functions/pil/image.py +61 -64
  43. pixeltable/functions/together.py +32 -6
  44. pixeltable/functions/util.py +0 -43
  45. pixeltable/functions/video.py +46 -8
  46. pixeltable/globals.py +443 -0
  47. pixeltable/index/__init__.py +1 -0
  48. pixeltable/index/base.py +9 -2
  49. pixeltable/index/btree.py +54 -0
  50. pixeltable/index/embedding_index.py +91 -15
  51. pixeltable/io/__init__.py +4 -0
  52. pixeltable/io/globals.py +59 -0
  53. pixeltable/{utils → io}/hf_datasets.py +48 -17
  54. pixeltable/io/pandas.py +148 -0
  55. pixeltable/{utils → io}/parquet.py +58 -33
  56. pixeltable/iterators/__init__.py +1 -1
  57. pixeltable/iterators/base.py +8 -4
  58. pixeltable/iterators/document.py +225 -93
  59. pixeltable/iterators/video.py +16 -9
  60. pixeltable/metadata/__init__.py +8 -4
  61. pixeltable/metadata/converters/convert_12.py +3 -0
  62. pixeltable/metadata/converters/convert_13.py +41 -0
  63. pixeltable/metadata/converters/convert_14.py +13 -0
  64. pixeltable/metadata/converters/convert_15.py +29 -0
  65. pixeltable/metadata/converters/util.py +63 -0
  66. pixeltable/metadata/schema.py +12 -6
  67. pixeltable/plan.py +11 -24
  68. pixeltable/store.py +16 -23
  69. pixeltable/tool/create_test_db_dump.py +49 -14
  70. pixeltable/type_system.py +27 -58
  71. pixeltable/utils/coco.py +94 -0
  72. pixeltable/utils/documents.py +42 -12
  73. pixeltable/utils/http_server.py +70 -0
  74. pixeltable-0.2.7.dist-info/METADATA +137 -0
  75. pixeltable-0.2.7.dist-info/RECORD +126 -0
  76. {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +1 -1
  77. pixeltable/client.py +0 -600
  78. pixeltable/exprs/image_similarity_predicate.py +0 -58
  79. pixeltable/func/batched_function.py +0 -53
  80. pixeltable/func/nos_function.py +0 -202
  81. pixeltable/tests/conftest.py +0 -171
  82. pixeltable/tests/ext/test_yolox.py +0 -21
  83. pixeltable/tests/functions/test_fireworks.py +0 -43
  84. pixeltable/tests/functions/test_functions.py +0 -60
  85. pixeltable/tests/functions/test_huggingface.py +0 -158
  86. pixeltable/tests/functions/test_openai.py +0 -162
  87. pixeltable/tests/functions/test_together.py +0 -112
  88. pixeltable/tests/test_audio.py +0 -65
  89. pixeltable/tests/test_catalog.py +0 -27
  90. pixeltable/tests/test_client.py +0 -21
  91. pixeltable/tests/test_component_view.py +0 -379
  92. pixeltable/tests/test_dataframe.py +0 -440
  93. pixeltable/tests/test_dirs.py +0 -107
  94. pixeltable/tests/test_document.py +0 -120
  95. pixeltable/tests/test_exprs.py +0 -802
  96. pixeltable/tests/test_function.py +0 -332
  97. pixeltable/tests/test_index.py +0 -138
  98. pixeltable/tests/test_migration.py +0 -44
  99. pixeltable/tests/test_nos.py +0 -54
  100. pixeltable/tests/test_snapshot.py +0 -231
  101. pixeltable/tests/test_table.py +0 -1343
  102. pixeltable/tests/test_transactional_directory.py +0 -42
  103. pixeltable/tests/test_types.py +0 -52
  104. pixeltable/tests/test_video.py +0 -159
  105. pixeltable/tests/test_view.py +0 -535
  106. pixeltable/tests/utils.py +0 -442
  107. pixeltable/utils/clip.py +0 -18
  108. pixeltable-0.2.5.dist-info/METADATA +0 -128
  109. pixeltable-0.2.5.dist-info/RECORD +0 -139
  110. {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0
@@ -106,9 +106,14 @@ class TableVersionPath:
  if self.base is not None:
  base_cols = self.base.columns()
  # we only include base columns that don't conflict with one of our column names
- result.extend([c for c in base_cols if c.name not in self.tbl_version.cols_by_name])
+ result.extend(c for c in base_cols if c.name not in self.tbl_version.cols_by_name)
  return result

+ def cols_by_name(self) -> dict[str, Column]:
+ """Return a dict of all user columns visible in this tbl version path, including columns from bases"""
+ cols = self.columns()
+ return {col.name: col for col in cols}
+
  def get_column(self, name: str, include_bases: bool = True) -> Optional[Column]:
  """Return the column with the given name, or None if not found"""
  col = self.tbl_version.cols_by_name.get(name)
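
Note on the hunk above: the new cols_by_name() helper simply re-keys the output of columns() by column name, after columns() has already dropped base-table columns whose names collide with the view's own columns. A minimal standalone sketch of that merge-then-index behavior (the Col stand-in and variable names below are illustrative, not pixeltable internals):

from dataclasses import dataclass
from typing import Dict, List

@dataclass
class Col:
    # stand-in for pixeltable's Column; only the name matters here
    name: str

def visible_columns(own_cols: List[Col], base_cols: List[Col]) -> List[Col]:
    # base columns are visible only if they don't conflict with one of our column names
    own_names = {c.name for c in own_cols}
    return own_cols + [c for c in base_cols if c.name not in own_names]

def cols_by_name(own_cols: List[Col], base_cols: List[Col]) -> Dict[str, Col]:
    # same shape as the new TableVersionPath.cols_by_name(): name -> column, bases included
    return {c.name: c for c in visible_columns(own_cols, base_cols)}

# example: the base's 'id' is shadowed by the view's own 'id'
print(list(cols_by_name([Col('id'), Col('score')], [Col('id'), Col('img')])))
# prints ['id', 'score', 'img']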
pixeltable/dataframe.py CHANGED
@@ -26,18 +26,15 @@ from pixeltable.catalog import is_valid_identifier
  from pixeltable.env import Env
  from pixeltable.plan import Planner
  from pixeltable.type_system import ColumnType
+ from pixeltable.utils.http_server import get_file_uri

- __all__ = [
- 'DataFrame'
- ]
+ __all__ = ['DataFrame']

  _logger = logging.getLogger('pixeltable')


  def _create_source_tag(file_path: str) -> str:
- abs_path = Path(file_path)
- assert abs_path.is_absolute()
- src_url = f'{Env.get().http_address}/{abs_path}'
+ src_url = get_file_uri(Env.get().http_address, file_path)
  mime = mimetypes.guess_type(src_url)[0]
  # if mime is None, the attribute string would not be valid html.
  mime_attr = f'type="{mime}"' if mime is not None else ''
@@ -45,7 +42,6 @@ def _create_source_tag(file_path: str) -> str:


  class DataFrameResultSet:
-
  def __init__(self, rows: List[List[Any]], col_names: List[str], col_types: List[ColumnType]):
  self._rows = rows
  self._col_names = col_names
@@ -54,6 +50,7 @@ class DataFrameResultSet:
  ts.ImageType: self._format_img,
  ts.VideoType: self._format_video,
  ts.AudioType: self._format_audio,
+ ts.DocumentType: self._format_document,
  }

  def __len__(self) -> int:
@@ -90,7 +87,6 @@ class DataFrameResultSet:
  return {self._col_names[i]: self._rows[row_idx][i] for i in range(len(self._col_names))}

  # Formatters
-
  def _format_img(self, img: Image.Image) -> str:
  """
  Create <img> tag for Image object.
@@ -106,14 +102,14 @@ class DataFrameResultSet:
  with io.BytesIO() as buffer:
  img.save(buffer, 'jpeg')
  img_base64 = base64.b64encode(buffer.getvalue()).decode()
- return f'''
- <div style="width:{width}px;">
+ return f"""
+ <div class="pxt_image" style="width:{width}px;">
  <img src="data:image/jpeg;base64,{img_base64}" width="{width}" />
  </div>
- '''
+ """

  def _format_video(self, file_path: str) -> str:
- thumb_tag = ""
+ thumb_tag = ''
  # Attempt to extract the first frame of the video to use as a thumbnail,
  # so that the notebook can be exported as HTML and viewed in contexts where
  # the video itself is not accessible.
@@ -136,16 +132,53 @@ class DataFrameResultSet:
  width = 480
  else:
  width = 800
- return f'''
- <div style="width:{width}px;">
+ return f"""
+ <div class="pxt_video" style="width:{width}px;">
  <video controls width="{width}" {thumb_tag}>
  {_create_source_tag(file_path)}
  </video>
  </div>
- '''
+ """
+
+ def _format_document(self, file_path: str) -> str:
+ max_width = max_height = 320
+ # by default, file path will be shown as a link
+ inner_element = file_path
+ # try generating a thumbnail for different types and use that if successful
+ if file_path.lower().endswith('.pdf'):
+ try:
+ import fitz
+
+ doc = fitz.open(file_path)
+ p = doc.get_page_pixmap(0)
+ while p.width > max_width or p.height > max_height:
+ # shrink(1) will halve each dimension
+ p.shrink(1)
+ data = p.tobytes(output='jpeg')
+ thumb_base64 = base64.b64encode(data).decode()
+ img_src = f'data:image/jpeg;base64,{thumb_base64}'
+ inner_element = f"""
+ <img style="object-fit: contain; border: 1px solid black;" src="{img_src}" />
+ """
+ except:
+ logging.warning(f'Failed to produce PDF thumbnail {file_path}. Make sure you have PyMuPDF installed.')
+
+ return f"""
+ <div class="pxt_document" style="width:{max_width}px;">
+ <a href="{get_file_uri(Env.get().http_address, file_path)}">
+ {inner_element}
+ </a>
+ </div>
+ """

  def _format_audio(self, file_path: str) -> str:
- return f'<audio controls>{_create_source_tag(file_path)}</audio>'
+ return f"""
+ <div class="pxt_audio">
+ <audio controls>
+ {_create_source_tag(file_path)}
+ </audio>
+ </div>
+ """

  def __getitem__(self, index: Any) -> Any:
  if isinstance(index, str):
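
Note on the _format_document() addition above: the PDF branch renders a thumbnail of page 0 with PyMuPDF (fitz), shrinking the pixmap until it fits within 320x320 and embedding it as a base64 JPEG data URI. A standalone sketch of the same steps, limited to the calls that appear in the hunk (the bare except: is narrowed to Exception here purely for illustration; PyMuPDF remains an optional dependency):

import base64
import logging
from typing import Optional

def pdf_thumbnail_data_uri(file_path: str, max_width: int = 320, max_height: int = 320) -> Optional[str]:
    # returns a data URI for page 0 of the PDF, or None if rendering fails
    try:
        import fitz  # PyMuPDF

        doc = fitz.open(file_path)
        pixmap = doc.get_page_pixmap(0)
        while pixmap.width > max_width or pixmap.height > max_height:
            pixmap.shrink(1)  # shrink(1) halves each dimension
        data = pixmap.tobytes(output='jpeg')
        return f'data:image/jpeg;base64,{base64.b64encode(data).decode()}'
    except Exception:
        logging.warning(f'Failed to produce PDF thumbnail {file_path}. Make sure you have PyMuPDF installed.')
        return None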
@@ -186,51 +219,53 @@ class DataFrameResultSetIterator:
  return row


- # TODO: remove this; it's only here as a reminder that we still need to call release() in the current implementation
- class AnalysisInfo:
- def __init__(self, tbl: catalog.TableVersion):
- self.tbl = tbl
- # output of the SQL scan stage
- self.sql_scan_output_exprs: List[exprs.Expr] = []
- # output of the agg stage
- self.agg_output_exprs: List[exprs.Expr] = []
- # Where clause of the Select stmt of the SQL scan stage
- self.sql_where_clause: Optional[sql.ClauseElement] = None
- # filter predicate applied to input rows of the SQL scan stage
- self.filter: Optional[exprs.Predicate] = None
- self.similarity_clause: Optional[exprs.ImageSimilarityPredicate] = None
- self.agg_fn_calls: List[exprs.FunctionCall] = [] # derived from unique_exprs
- self.has_frame_col: bool = False # True if we're referencing the frame col
-
- self.evaluator: Optional[exprs.Evaluator] = None
- self.sql_scan_eval_ctx: List[exprs.Expr] = [] # needed to materialize output of SQL scan stage
- self.agg_eval_ctx: List[exprs.Expr] = [] # needed to materialize output of agg stage
- self.filter_eval_ctx: List[exprs.Expr] = []
- self.group_by_eval_ctx: List[exprs.Expr] = []
-
- def finalize_exec(self) -> None:
- """
- Call release() on all collected Exprs.
- """
- exprs.Expr.release_list(self.sql_scan_output_exprs)
- exprs.Expr.release_list(self.agg_output_exprs)
- if self.filter is not None:
- self.filter.release()
+ # # TODO: remove this; it's only here as a reminder that we still need to call release() in the current implementation
+ # class AnalysisInfo:
+ # def __init__(self, tbl: catalog.TableVersion):
+ # self.tbl = tbl
+ # # output of the SQL scan stage
+ # self.sql_scan_output_exprs: List[exprs.Expr] = []
+ # # output of the agg stage
+ # self.agg_output_exprs: List[exprs.Expr] = []
+ # # Where clause of the Select stmt of the SQL scan stage
+ # self.sql_where_clause: Optional[sql.ClauseElement] = None
+ # # filter predicate applied to input rows of the SQL scan stage
+ # self.filter: Optional[exprs.Predicate] = None
+ # self.similarity_clause: Optional[exprs.ImageSimilarityPredicate] = None
+ # self.agg_fn_calls: List[exprs.FunctionCall] = [] # derived from unique_exprs
+ # self.has_frame_col: bool = False # True if we're referencing the frame col
+ #
+ # self.evaluator: Optional[exprs.Evaluator] = None
+ # self.sql_scan_eval_ctx: List[exprs.Expr] = [] # needed to materialize output of SQL scan stage
+ # self.agg_eval_ctx: List[exprs.Expr] = [] # needed to materialize output of agg stage
+ # self.filter_eval_ctx: List[exprs.Expr] = []
+ # self.group_by_eval_ctx: List[exprs.Expr] = []
+ #
+ # def finalize_exec(self) -> None:
+ # """
+ # Call release() on all collected Exprs.
+ # """
+ # exprs.Expr.release_list(self.sql_scan_output_exprs)
+ # exprs.Expr.release_list(self.agg_output_exprs)
+ # if self.filter is not None:
+ # self.filter.release()


  class DataFrame:
  def __init__(
- self, tbl: catalog.TableVersionPath,
- select_list: Optional[List[Tuple[exprs.Expr, Optional[str]]]] = None,
- where_clause: Optional[exprs.Predicate] = None,
- group_by_clause: Optional[List[exprs.Expr]] = None,
- grouping_tbl: Optional[catalog.TableVersion] = None,
- order_by_clause: Optional[List[Tuple[exprs.Expr, bool]]] = None, # List[(expr, asc)]
- limit: Optional[int] = None):
+ self,
+ tbl: catalog.TableVersionPath,
+ select_list: Optional[List[Tuple[exprs.Expr, Optional[str]]]] = None,
+ where_clause: Optional[exprs.Predicate] = None,
+ group_by_clause: Optional[List[exprs.Expr]] = None,
+ grouping_tbl: Optional[catalog.TableVersion] = None,
+ order_by_clause: Optional[List[Tuple[exprs.Expr, bool]]] = None, # List[(expr, asc)]
+ limit: Optional[int] = None,
+ ):
  self.tbl = tbl

  # select list logic
- DataFrame._select_list_check_rep(select_list) # check select list without expansion
+ DataFrame._select_list_check_rep(select_list)  # check select list without expansion
  # exprs contain execution state and therefore cannot be shared
  select_list = copy.deepcopy(select_list)
  select_list_exprs, column_names = DataFrame._normalize_select_list(tbl, select_list)
@@ -249,12 +284,12 @@ class DataFrame:
  self.limit_val = limit

  @classmethod
- def _select_list_check_rep(cls,
+ def _select_list_check_rep(
+ cls,
  select_list: Optional[List[Tuple[exprs.Expr, Optional[str]]]],
  ) -> None:
- """Validate basic select list types.
- """
- if select_list is None: # basic check for valid select list
+ """Validate basic select list types."""
+ if select_list is None:  # basic check for valid select list
  return

  assert len(select_list) > 0
@@ -267,13 +302,14 @@ class DataFrame:
  assert is_valid_identifier(ent[1])

  @classmethod
- def _normalize_select_list(cls,
+ def _normalize_select_list(
+ cls,
  tbl: catalog.TableVersionPath,
  select_list: Optional[List[Tuple[exprs.Expr, Optional[str]]]],
  ) -> Tuple[List[exprs.Expr], List[str]]:
  """
  Expand select list information with all columns and their names
- Returns:
+ Returns:
  a pair composed of the list of expressions and the list of corresponding names
  """
  if select_list is None:
@@ -281,9 +317,9 @@ class DataFrame:
  else:
  expanded_list = select_list

- out_exprs : List[exprs.Expr] = []
- out_names : List[str] = [] # keep track of order
- seen_out_names : set[str] = set() # use to check for duplicates in loop, avoid square complexity
+ out_exprs: List[exprs.Expr] = []
+ out_names: List[str] = []  # keep track of order
+ seen_out_names: set[str] = set()  # use to check for duplicates in loop, avoid square complexity
  for i, (expr, name) in enumerate(expanded_list):
  if name is None:
  # use default, add suffix if needed so default adds no duplicates
@@ -292,13 +328,13 @@ class DataFrame:
  column_name = default_name
  if default_name in seen_out_names:
  # already used, then add suffix until unique name is found
- for j in range(1, len(out_names)+1):
+ for j in range(1, len(out_names) + 1):
  column_name = f'{default_name}_{j}'
  if column_name not in seen_out_names:
  break
- else: # no default name, eg some expressions
+ else:  # no default name, eg some expressions
  column_name = f'col_{i}'
- else: # user provided name, no attempt to rename
+ else:  # user provided name, no attempt to rename
  column_name = name

  out_exprs.append(expr)
@@ -326,9 +362,13 @@ class DataFrame:
  for item in self._select_list_exprs:
  item.bind_rel_paths(None)
  plan = Planner.create_query_plan(
- self.tbl, self._select_list_exprs, where_clause=self.where_clause, group_by_clause=group_by_clause,
+ self.tbl,
+ self._select_list_exprs,
+ where_clause=self.where_clause,
+ group_by_clause=group_by_clause,
  order_by_clause=self.order_by_clause if self.order_by_clause is not None else [],
- limit=self.limit_val if self.limit_val is not None else 0) # limit_val == 0: no limit_val
+ limit=self.limit_val if self.limit_val is not None else 0,
+ ) # limit_val == 0: no limit_val

  with Env.get().engine.begin() as conn:
  plan.ctx.conn = conn
@@ -374,12 +414,10 @@ class DataFrame:
  result_row = [data_row[e.slot_idx] for e in self._select_list_exprs]
  result_rows.append(result_row)
  except excs.ExprEvalError as e:
- msg = (f'In row {e.row_num} the {e.expr_msg} encountered exception '
- f'{type(e.exc).__name__}:\n{str(e.exc)}')
+ msg = f'In row {e.row_num} the {e.expr_msg} encountered exception ' f'{type(e.exc).__name__}:\n{str(e.exc)}'
  if len(e.input_vals) > 0:
  input_msgs = [
- f"'{d}' = {d.col_type.print_value(e.input_vals[i])}"
- for i, d in enumerate(e.expr.dependencies())
+ f"'{d}' = {d.col_type.print_value(e.input_vals[i])}" for i, d in enumerate(e.expr.dependencies())
  ]
  msg += f'\nwith {", ".join(input_msgs)}'
  assert e.exc_tb is not None
@@ -399,6 +437,7 @@ class DataFrame:

  def count(self) -> int:
  from pixeltable.plan import Planner
+
  stmt = Planner.create_count_stmt(self.tbl, self.where_clause)
  with Env.get().engine.connect() as conn:
  result: int = conn.execute(stmt).scalar_one()
@@ -424,9 +463,9 @@ class DataFrame:
  if self.order_by_clause is not None:
  heading_vals.append('Order By')
  heading_vals.extend([''] * (len(self.order_by_clause) - 1))
- info_vals.extend([
- f'{e[0].display_str(inline=False)} {"asc" if e[1] else "desc"}' for e in self.order_by_clause
- ])
+ info_vals.extend(
+ [f'{e[0].display_str(inline=False)} {"asc" if e[1] else "desc"}' for e in self.order_by_clause]
+ )
  if self.limit_val is not None:
  heading_vals.append('Limit')
  info_vals.append(str(self.limit_val))
@@ -440,9 +479,12 @@ class DataFrame:
  pd_df = self._description()
  # white-space: pre-wrap: print \n as newline
  # th: center-align headings
- return pd_df.style.set_properties(**{'white-space': 'pre-wrap', 'text-align': 'left'}) \
- .set_table_styles([dict(selector='th', props=[('text-align', 'center')])]) \
- .hide(axis='index').hide(axis='columns')
+ return (
+ pd_df.style.set_properties(**{'white-space': 'pre-wrap', 'text-align': 'left'})
+ .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
+ .hide(axis='index')
+ .hide(axis='columns')
+ )

  def describe(self) -> None:
  """
@@ -453,6 +495,7 @@ class DataFrame:
  try:
  __IPYTHON__
  from IPython.display import display
+
  display(self._description_html())
  except NameError:
  print(self.__repr__())
@@ -463,16 +506,16 @@ class DataFrame:
  def _repr_html_(self) -> str:
  return self._description_html()._repr_html_()

- def select(self, *items: Any, **named_items : Any) -> DataFrame:
+ def select(self, *items: Any, **named_items: Any) -> DataFrame:
  if self.select_list is not None:
  raise excs.Error(f'Select list already specified')
- for (name, _) in named_items.items():
+ for name, _ in named_items.items():
  if not isinstance(name, str) or not is_valid_identifier(name):
  raise excs.Error(f'Invalid name: {name}')
  base_list = [(expr, None) for expr in items] + [(expr, k) for (k, expr) in named_items.items()]
  if len(base_list) == 0:
  raise excs.Error(f'Empty select list')
-
+
  # analyze select list; wrap literals with the corresponding expressions
  select_list = []
  for raw_expr, name in base_list:
@@ -501,13 +544,25 @@ class DataFrame:
  seen.add(name)

  return DataFrame(
- self.tbl, select_list=select_list, where_clause=self.where_clause, group_by_clause=self.group_by_clause,
- grouping_tbl=self.grouping_tbl, order_by_clause=self.order_by_clause, limit=self.limit_val)
+ self.tbl,
+ select_list=select_list,
+ where_clause=self.where_clause,
+ group_by_clause=self.group_by_clause,
+ grouping_tbl=self.grouping_tbl,
+ order_by_clause=self.order_by_clause,
+ limit=self.limit_val,
+ )

  def where(self, pred: exprs.Predicate) -> DataFrame:
  return DataFrame(
- self.tbl, select_list=self.select_list, where_clause=pred, group_by_clause=self.group_by_clause,
- grouping_tbl=self.grouping_tbl, order_by_clause=self.order_by_clause, limit=self.limit_val)
+ self.tbl,
+ select_list=self.select_list,
+ where_clause=pred,
+ group_by_clause=self.group_by_clause,
+ grouping_tbl=self.grouping_tbl,
+ order_by_clause=self.order_by_clause,
+ limit=self.limit_val,
+ )

  def group_by(self, *grouping_items: Any) -> DataFrame:
  """Add a group-by clause to this DataFrame.
@@ -534,8 +589,14 @@ class DataFrame:
  if grouping_tbl is None:
  group_by_clause = list(grouping_items)
  return DataFrame(
- self.tbl, select_list=self.select_list, where_clause=self.where_clause, group_by_clause=group_by_clause,
- grouping_tbl=grouping_tbl, order_by_clause=self.order_by_clause, limit=self.limit_val)
+ self.tbl,
+ select_list=self.select_list,
+ where_clause=self.where_clause,
+ group_by_clause=group_by_clause,
+ grouping_tbl=grouping_tbl,
+ order_by_clause=self.order_by_clause,
+ limit=self.limit_val,
+ )

  def order_by(self, *expr_list: exprs.Expr, asc: bool = True) -> DataFrame:
  for e in expr_list:
@@ -544,16 +605,26 @@ class DataFrame:
  order_by_clause = self.order_by_clause if self.order_by_clause is not None else []
  order_by_clause.extend([(e.copy(), asc) for e in expr_list])
  return DataFrame(
- self.tbl, select_list=self.select_list, where_clause=self.where_clause,
- group_by_clause=self.group_by_clause, grouping_tbl=self.grouping_tbl, order_by_clause=order_by_clause,
- limit=self.limit_val)
+ self.tbl,
+ select_list=self.select_list,
+ where_clause=self.where_clause,
+ group_by_clause=self.group_by_clause,
+ grouping_tbl=self.grouping_tbl,
+ order_by_clause=order_by_clause,
+ limit=self.limit_val,
+ )

  def limit(self, n: int) -> DataFrame:
  assert n is not None and isinstance(n, int)
  return DataFrame(
- self.tbl, select_list=self.select_list, where_clause=self.where_clause,
- group_by_clause=self.group_by_clause, grouping_tbl=self.grouping_tbl, order_by_clause=self.order_by_clause,
- limit=n)
+ self.tbl,
+ select_list=self.select_list,
+ where_clause=self.where_clause,
+ group_by_clause=self.group_by_clause,
+ grouping_tbl=self.grouping_tbl,
+ order_by_clause=self.order_by_clause,
+ limit=n,
+ )

  def __getitem__(self, index: object) -> DataFrame:
  """
@@ -571,24 +642,27 @@ class DataFrame:
  if isinstance(index, list):
  return self.select(*index)
  raise TypeError(f'Invalid index type: {type(index)}')
-
+
  def _as_dict(self) -> Dict[str, Any]:
- """
- Returns:
- Dictionary representing this dataframe.
+ """
+ Returns:
+ Dictionary representing this dataframe.
  """
  tbl_versions = self.tbl.get_tbl_versions()
  d = {
  '_classname': 'DataFrame',
  'tbl_ids': [str(t.id) for t in tbl_versions],
  'tbl_versions': [t.version for t in tbl_versions],
- 'select_list':
- [(e.as_dict(), name) for (e, name) in self.select_list] if self.select_list is not None else None,
+ 'select_list': [(e.as_dict(), name) for (e, name) in self.select_list]
+ if self.select_list is not None
+ else None,
  'where_clause': self.where_clause.as_dict() if self.where_clause is not None else None,
- 'group_by_clause':
- [e.as_dict() for e in self.group_by_clause] if self.group_by_clause is not None else None,
- 'order_by_clause':
- [(e.as_dict(), asc) for (e,asc) in self.order_by_clause] if self.order_by_clause is not None else None,
+ 'group_by_clause': [e.as_dict() for e in self.group_by_clause]
+ if self.group_by_clause is not None
+ else None,
+ 'order_by_clause': [(e.as_dict(), asc) for (e, asc) in self.order_by_clause]
+ if self.order_by_clause is not None
+ else None,
  'limit_val': self.limit_val,
  }
  return d
@@ -615,7 +689,7 @@ class DataFrame:
  summary_string = json.dumps(self._as_dict())
  cache_key = hashlib.sha256(summary_string.encode()).hexdigest()

- dest_path = (Env.get().dataset_cache_dir / f'coco_{cache_key}')
+ dest_path = Env.get().dataset_cache_dir / f'coco_{cache_key}'
  if dest_path.exists():
  assert dest_path.is_dir()
  data_file_path = dest_path / 'data.json'
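
Note on the hunk above: both the COCO export here and the parquet export in the next hunk derive their on-disk cache location from a SHA-256 hash of the DataFrame's serialized form (_as_dict()), so an identical query always maps to the same directory under dataset_cache_dir. A minimal sketch of that keying scheme (the summary dict and cache directory below are placeholders, not pixeltable's actual _as_dict() output):

import hashlib
import json
from pathlib import Path

def dataset_cache_path(cache_dir: Path, prefix: str, query_summary: dict) -> Path:
    # identical summaries (same tables, versions, and clauses) hash to the same path
    summary_string = json.dumps(query_summary)
    cache_key = hashlib.sha256(summary_string.encode()).hexdigest()
    return cache_dir / f'{prefix}_{cache_key}'

# example with a placeholder summary dict
path = dataset_cache_path(Path('/tmp/pxt_cache'), 'coco', {'tbl_ids': ['...'], 'limit_val': None})
print(path.name)  # coco_<64-char sha256 hex digest>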
@@ -660,14 +734,14 @@ class DataFrame:
  Env.get().require_package('torch')
  Env.get().require_package('torchvision')

- from pixeltable.utils.parquet import save_parquet # pylint: disable=import-outside-toplevel
- from pixeltable.utils.pytorch import PixeltablePytorchDataset # pylint: disable=import-outside-toplevel
+ from pixeltable.io.parquet import save_parquet # pylint: disable=import-outside-toplevel
+ from pixeltable.utils.pytorch import PixeltablePytorchDataset # pylint: disable=import-outside-toplevel

- summary_string = json.dumps(self._as_dict())
+ summary_string = json.dumps(self._as_dict())
  cache_key = hashlib.sha256(summary_string.encode()).hexdigest()
-
- dest_path = (Env.get().dataset_cache_dir / f'df_{cache_key}').with_suffix('.parquet') # pylint: disable = protected-access
- if dest_path.exists(): # fast path: use cache
+
+ dest_path = (Env.get().dataset_cache_dir / f'df_{cache_key}').with_suffix('.parquet')  # pylint: disable = protected-access
+ if dest_path.exists():  # fast path: use cache
  assert dest_path.is_dir()
  else:
  save_parquet(self, dest_path)
@@ -0,0 +1 @@
+ from .remote import Remote