pixeltable 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +20 -9
- pixeltable/__version__.py +3 -0
- pixeltable/catalog/column.py +23 -7
- pixeltable/catalog/insertable_table.py +32 -19
- pixeltable/catalog/table.py +210 -20
- pixeltable/catalog/table_version.py +272 -111
- pixeltable/catalog/table_version_path.py +6 -1
- pixeltable/dataframe.py +184 -110
- pixeltable/datatransfer/__init__.py +1 -0
- pixeltable/datatransfer/label_studio.py +526 -0
- pixeltable/datatransfer/remote.py +113 -0
- pixeltable/env.py +213 -79
- pixeltable/exec/__init__.py +2 -1
- pixeltable/exec/data_row_batch.py +6 -7
- pixeltable/exec/expr_eval_node.py +28 -28
- pixeltable/exec/sql_scan_node.py +7 -6
- pixeltable/exprs/__init__.py +4 -3
- pixeltable/exprs/column_ref.py +11 -2
- pixeltable/exprs/comparison.py +39 -1
- pixeltable/exprs/data_row.py +7 -0
- pixeltable/exprs/expr.py +26 -19
- pixeltable/exprs/function_call.py +17 -18
- pixeltable/exprs/globals.py +14 -2
- pixeltable/exprs/image_member_access.py +9 -28
- pixeltable/exprs/in_predicate.py +96 -0
- pixeltable/exprs/inline_array.py +13 -11
- pixeltable/exprs/inline_dict.py +15 -13
- pixeltable/exprs/row_builder.py +7 -1
- pixeltable/exprs/similarity_expr.py +67 -0
- pixeltable/ext/functions/whisperx.py +30 -0
- pixeltable/ext/functions/yolox.py +16 -0
- pixeltable/func/__init__.py +0 -2
- pixeltable/func/aggregate_function.py +5 -2
- pixeltable/func/callable_function.py +57 -13
- pixeltable/func/expr_template_function.py +14 -3
- pixeltable/func/function.py +35 -4
- pixeltable/func/signature.py +5 -15
- pixeltable/func/udf.py +8 -12
- pixeltable/functions/fireworks.py +9 -4
- pixeltable/functions/huggingface.py +48 -5
- pixeltable/functions/openai.py +49 -11
- pixeltable/functions/pil/image.py +61 -64
- pixeltable/functions/together.py +32 -6
- pixeltable/functions/util.py +0 -43
- pixeltable/functions/video.py +46 -8
- pixeltable/globals.py +443 -0
- pixeltable/index/__init__.py +1 -0
- pixeltable/index/base.py +9 -2
- pixeltable/index/btree.py +54 -0
- pixeltable/index/embedding_index.py +91 -15
- pixeltable/io/__init__.py +4 -0
- pixeltable/io/globals.py +59 -0
- pixeltable/{utils → io}/hf_datasets.py +48 -17
- pixeltable/io/pandas.py +148 -0
- pixeltable/{utils → io}/parquet.py +58 -33
- pixeltable/iterators/__init__.py +1 -1
- pixeltable/iterators/base.py +8 -4
- pixeltable/iterators/document.py +225 -93
- pixeltable/iterators/video.py +16 -9
- pixeltable/metadata/__init__.py +8 -4
- pixeltable/metadata/converters/convert_12.py +3 -0
- pixeltable/metadata/converters/convert_13.py +41 -0
- pixeltable/metadata/converters/convert_14.py +13 -0
- pixeltable/metadata/converters/convert_15.py +29 -0
- pixeltable/metadata/converters/util.py +63 -0
- pixeltable/metadata/schema.py +12 -6
- pixeltable/plan.py +11 -24
- pixeltable/store.py +16 -23
- pixeltable/tool/create_test_db_dump.py +49 -14
- pixeltable/type_system.py +27 -58
- pixeltable/utils/coco.py +94 -0
- pixeltable/utils/documents.py +42 -12
- pixeltable/utils/http_server.py +70 -0
- pixeltable-0.2.7.dist-info/METADATA +137 -0
- pixeltable-0.2.7.dist-info/RECORD +126 -0
- {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +1 -1
- pixeltable/client.py +0 -600
- pixeltable/exprs/image_similarity_predicate.py +0 -58
- pixeltable/func/batched_function.py +0 -53
- pixeltable/func/nos_function.py +0 -202
- pixeltable/tests/conftest.py +0 -171
- pixeltable/tests/ext/test_yolox.py +0 -21
- pixeltable/tests/functions/test_fireworks.py +0 -43
- pixeltable/tests/functions/test_functions.py +0 -60
- pixeltable/tests/functions/test_huggingface.py +0 -158
- pixeltable/tests/functions/test_openai.py +0 -162
- pixeltable/tests/functions/test_together.py +0 -112
- pixeltable/tests/test_audio.py +0 -65
- pixeltable/tests/test_catalog.py +0 -27
- pixeltable/tests/test_client.py +0 -21
- pixeltable/tests/test_component_view.py +0 -379
- pixeltable/tests/test_dataframe.py +0 -440
- pixeltable/tests/test_dirs.py +0 -107
- pixeltable/tests/test_document.py +0 -120
- pixeltable/tests/test_exprs.py +0 -802
- pixeltable/tests/test_function.py +0 -332
- pixeltable/tests/test_index.py +0 -138
- pixeltable/tests/test_migration.py +0 -44
- pixeltable/tests/test_nos.py +0 -54
- pixeltable/tests/test_snapshot.py +0 -231
- pixeltable/tests/test_table.py +0 -1343
- pixeltable/tests/test_transactional_directory.py +0 -42
- pixeltable/tests/test_types.py +0 -52
- pixeltable/tests/test_video.py +0 -159
- pixeltable/tests/test_view.py +0 -535
- pixeltable/tests/utils.py +0 -442
- pixeltable/utils/clip.py +0 -18
- pixeltable-0.2.5.dist-info/METADATA +0 -128
- pixeltable-0.2.5.dist-info/RECORD +0 -139
- {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0
|
@@ -106,9 +106,14 @@ class TableVersionPath:
|
|
|
106
106
|
if self.base is not None:
|
|
107
107
|
base_cols = self.base.columns()
|
|
108
108
|
# we only include base columns that don't conflict with one of our column names
|
|
109
|
-
result.extend(
|
|
109
|
+
result.extend(c for c in base_cols if c.name not in self.tbl_version.cols_by_name)
|
|
110
110
|
return result
|
|
111
111
|
|
|
112
|
+
def cols_by_name(self) -> dict[str, Column]:
|
|
113
|
+
"""Return a dict of all user columns visible in this tbl version path, including columns from bases"""
|
|
114
|
+
cols = self.columns()
|
|
115
|
+
return {col.name: col for col in cols}
|
|
116
|
+
|
|
112
117
|
def get_column(self, name: str, include_bases: bool = True) -> Optional[Column]:
|
|
113
118
|
"""Return the column with the given name, or None if not found"""
|
|
114
119
|
col = self.tbl_version.cols_by_name.get(name)
|
pixeltable/dataframe.py
CHANGED
|
@@ -26,18 +26,15 @@ from pixeltable.catalog import is_valid_identifier
|
|
|
26
26
|
from pixeltable.env import Env
|
|
27
27
|
from pixeltable.plan import Planner
|
|
28
28
|
from pixeltable.type_system import ColumnType
|
|
29
|
+
from pixeltable.utils.http_server import get_file_uri
|
|
29
30
|
|
|
30
|
-
__all__ = [
|
|
31
|
-
'DataFrame'
|
|
32
|
-
]
|
|
31
|
+
__all__ = ['DataFrame']
|
|
33
32
|
|
|
34
33
|
_logger = logging.getLogger('pixeltable')
|
|
35
34
|
|
|
36
35
|
|
|
37
36
|
def _create_source_tag(file_path: str) -> str:
|
|
38
|
-
|
|
39
|
-
assert abs_path.is_absolute()
|
|
40
|
-
src_url = f'{Env.get().http_address}/{abs_path}'
|
|
37
|
+
src_url = get_file_uri(Env.get().http_address, file_path)
|
|
41
38
|
mime = mimetypes.guess_type(src_url)[0]
|
|
42
39
|
# if mime is None, the attribute string would not be valid html.
|
|
43
40
|
mime_attr = f'type="{mime}"' if mime is not None else ''
|
|
@@ -45,7 +42,6 @@ def _create_source_tag(file_path: str) -> str:
|
|
|
45
42
|
|
|
46
43
|
|
|
47
44
|
class DataFrameResultSet:
|
|
48
|
-
|
|
49
45
|
def __init__(self, rows: List[List[Any]], col_names: List[str], col_types: List[ColumnType]):
|
|
50
46
|
self._rows = rows
|
|
51
47
|
self._col_names = col_names
|
|
@@ -54,6 +50,7 @@ class DataFrameResultSet:
|
|
|
54
50
|
ts.ImageType: self._format_img,
|
|
55
51
|
ts.VideoType: self._format_video,
|
|
56
52
|
ts.AudioType: self._format_audio,
|
|
53
|
+
ts.DocumentType: self._format_document,
|
|
57
54
|
}
|
|
58
55
|
|
|
59
56
|
def __len__(self) -> int:
|
|
@@ -90,7 +87,6 @@ class DataFrameResultSet:
|
|
|
90
87
|
return {self._col_names[i]: self._rows[row_idx][i] for i in range(len(self._col_names))}
|
|
91
88
|
|
|
92
89
|
# Formatters
|
|
93
|
-
|
|
94
90
|
def _format_img(self, img: Image.Image) -> str:
|
|
95
91
|
"""
|
|
96
92
|
Create <img> tag for Image object.
|
|
@@ -106,14 +102,14 @@ class DataFrameResultSet:
|
|
|
106
102
|
with io.BytesIO() as buffer:
|
|
107
103
|
img.save(buffer, 'jpeg')
|
|
108
104
|
img_base64 = base64.b64encode(buffer.getvalue()).decode()
|
|
109
|
-
return f
|
|
110
|
-
<div style="width:{width}px;">
|
|
105
|
+
return f"""
|
|
106
|
+
<div class="pxt_image" style="width:{width}px;">
|
|
111
107
|
<img src="data:image/jpeg;base64,{img_base64}" width="{width}" />
|
|
112
108
|
</div>
|
|
113
|
-
|
|
109
|
+
"""
|
|
114
110
|
|
|
115
111
|
def _format_video(self, file_path: str) -> str:
|
|
116
|
-
thumb_tag =
|
|
112
|
+
thumb_tag = ''
|
|
117
113
|
# Attempt to extract the first frame of the video to use as a thumbnail,
|
|
118
114
|
# so that the notebook can be exported as HTML and viewed in contexts where
|
|
119
115
|
# the video itself is not accessible.
|
|
@@ -136,16 +132,53 @@ class DataFrameResultSet:
|
|
|
136
132
|
width = 480
|
|
137
133
|
else:
|
|
138
134
|
width = 800
|
|
139
|
-
return f
|
|
140
|
-
<div style="width:{width}px;">
|
|
135
|
+
return f"""
|
|
136
|
+
<div class="pxt_video" style="width:{width}px;">
|
|
141
137
|
<video controls width="{width}" {thumb_tag}>
|
|
142
138
|
{_create_source_tag(file_path)}
|
|
143
139
|
</video>
|
|
144
140
|
</div>
|
|
145
|
-
|
|
141
|
+
"""
|
|
142
|
+
|
|
143
|
+
def _format_document(self, file_path: str) -> str:
|
|
144
|
+
max_width = max_height = 320
|
|
145
|
+
# by default, file path will be shown as a link
|
|
146
|
+
inner_element = file_path
|
|
147
|
+
# try generating a thumbnail for different types and use that if successful
|
|
148
|
+
if file_path.lower().endswith('.pdf'):
|
|
149
|
+
try:
|
|
150
|
+
import fitz
|
|
151
|
+
|
|
152
|
+
doc = fitz.open(file_path)
|
|
153
|
+
p = doc.get_page_pixmap(0)
|
|
154
|
+
while p.width > max_width or p.height > max_height:
|
|
155
|
+
# shrink(1) will halve each dimension
|
|
156
|
+
p.shrink(1)
|
|
157
|
+
data = p.tobytes(output='jpeg')
|
|
158
|
+
thumb_base64 = base64.b64encode(data).decode()
|
|
159
|
+
img_src = f'data:image/jpeg;base64,{thumb_base64}'
|
|
160
|
+
inner_element = f"""
|
|
161
|
+
<img style="object-fit: contain; border: 1px solid black;" src="{img_src}" />
|
|
162
|
+
"""
|
|
163
|
+
except:
|
|
164
|
+
logging.warning(f'Failed to produce PDF thumbnail {file_path}. Make sure you have PyMuPDF installed.')
|
|
165
|
+
|
|
166
|
+
return f"""
|
|
167
|
+
<div class="pxt_document" style="width:{max_width}px;">
|
|
168
|
+
<a href="{get_file_uri(Env.get().http_address, file_path)}">
|
|
169
|
+
{inner_element}
|
|
170
|
+
</a>
|
|
171
|
+
</div>
|
|
172
|
+
"""
|
|
146
173
|
|
|
147
174
|
def _format_audio(self, file_path: str) -> str:
|
|
148
|
-
return f
|
|
175
|
+
return f"""
|
|
176
|
+
<div class="pxt_audio">
|
|
177
|
+
<audio controls>
|
|
178
|
+
{_create_source_tag(file_path)}
|
|
179
|
+
</audio>
|
|
180
|
+
</div>
|
|
181
|
+
"""
|
|
149
182
|
|
|
150
183
|
def __getitem__(self, index: Any) -> Any:
|
|
151
184
|
if isinstance(index, str):
|
|
@@ -186,51 +219,53 @@ class DataFrameResultSetIterator:
|
|
|
186
219
|
return row
|
|
187
220
|
|
|
188
221
|
|
|
189
|
-
# TODO: remove this; it's only here as a reminder that we still need to call release() in the current implementation
|
|
190
|
-
class AnalysisInfo:
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
222
|
+
# # TODO: remove this; it's only here as a reminder that we still need to call release() in the current implementation
|
|
223
|
+
# class AnalysisInfo:
|
|
224
|
+
# def __init__(self, tbl: catalog.TableVersion):
|
|
225
|
+
# self.tbl = tbl
|
|
226
|
+
# # output of the SQL scan stage
|
|
227
|
+
# self.sql_scan_output_exprs: List[exprs.Expr] = []
|
|
228
|
+
# # output of the agg stage
|
|
229
|
+
# self.agg_output_exprs: List[exprs.Expr] = []
|
|
230
|
+
# # Where clause of the Select stmt of the SQL scan stage
|
|
231
|
+
# self.sql_where_clause: Optional[sql.ClauseElement] = None
|
|
232
|
+
# # filter predicate applied to input rows of the SQL scan stage
|
|
233
|
+
# self.filter: Optional[exprs.Predicate] = None
|
|
234
|
+
# self.similarity_clause: Optional[exprs.ImageSimilarityPredicate] = None
|
|
235
|
+
# self.agg_fn_calls: List[exprs.FunctionCall] = [] # derived from unique_exprs
|
|
236
|
+
# self.has_frame_col: bool = False # True if we're referencing the frame col
|
|
237
|
+
#
|
|
238
|
+
# self.evaluator: Optional[exprs.Evaluator] = None
|
|
239
|
+
# self.sql_scan_eval_ctx: List[exprs.Expr] = [] # needed to materialize output of SQL scan stage
|
|
240
|
+
# self.agg_eval_ctx: List[exprs.Expr] = [] # needed to materialize output of agg stage
|
|
241
|
+
# self.filter_eval_ctx: List[exprs.Expr] = []
|
|
242
|
+
# self.group_by_eval_ctx: List[exprs.Expr] = []
|
|
243
|
+
#
|
|
244
|
+
# def finalize_exec(self) -> None:
|
|
245
|
+
# """
|
|
246
|
+
# Call release() on all collected Exprs.
|
|
247
|
+
# """
|
|
248
|
+
# exprs.Expr.release_list(self.sql_scan_output_exprs)
|
|
249
|
+
# exprs.Expr.release_list(self.agg_output_exprs)
|
|
250
|
+
# if self.filter is not None:
|
|
251
|
+
# self.filter.release()
|
|
219
252
|
|
|
220
253
|
|
|
221
254
|
class DataFrame:
|
|
222
255
|
def __init__(
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
256
|
+
self,
|
|
257
|
+
tbl: catalog.TableVersionPath,
|
|
258
|
+
select_list: Optional[List[Tuple[exprs.Expr, Optional[str]]]] = None,
|
|
259
|
+
where_clause: Optional[exprs.Predicate] = None,
|
|
260
|
+
group_by_clause: Optional[List[exprs.Expr]] = None,
|
|
261
|
+
grouping_tbl: Optional[catalog.TableVersion] = None,
|
|
262
|
+
order_by_clause: Optional[List[Tuple[exprs.Expr, bool]]] = None, # List[(expr, asc)]
|
|
263
|
+
limit: Optional[int] = None,
|
|
264
|
+
):
|
|
230
265
|
self.tbl = tbl
|
|
231
266
|
|
|
232
267
|
# select list logic
|
|
233
|
-
DataFrame._select_list_check_rep(select_list)
|
|
268
|
+
DataFrame._select_list_check_rep(select_list) # check select list without expansion
|
|
234
269
|
# exprs contain execution state and therefore cannot be shared
|
|
235
270
|
select_list = copy.deepcopy(select_list)
|
|
236
271
|
select_list_exprs, column_names = DataFrame._normalize_select_list(tbl, select_list)
|
|
@@ -249,12 +284,12 @@ class DataFrame:
|
|
|
249
284
|
self.limit_val = limit
|
|
250
285
|
|
|
251
286
|
@classmethod
|
|
252
|
-
def _select_list_check_rep(
|
|
287
|
+
def _select_list_check_rep(
|
|
288
|
+
cls,
|
|
253
289
|
select_list: Optional[List[Tuple[exprs.Expr, Optional[str]]]],
|
|
254
290
|
) -> None:
|
|
255
|
-
"""Validate basic select list types.
|
|
256
|
-
|
|
257
|
-
if select_list is None: # basic check for valid select list
|
|
291
|
+
"""Validate basic select list types."""
|
|
292
|
+
if select_list is None: # basic check for valid select list
|
|
258
293
|
return
|
|
259
294
|
|
|
260
295
|
assert len(select_list) > 0
|
|
@@ -267,13 +302,14 @@ class DataFrame:
|
|
|
267
302
|
assert is_valid_identifier(ent[1])
|
|
268
303
|
|
|
269
304
|
@classmethod
|
|
270
|
-
def _normalize_select_list(
|
|
305
|
+
def _normalize_select_list(
|
|
306
|
+
cls,
|
|
271
307
|
tbl: catalog.TableVersionPath,
|
|
272
308
|
select_list: Optional[List[Tuple[exprs.Expr, Optional[str]]]],
|
|
273
309
|
) -> Tuple[List[exprs.Expr], List[str]]:
|
|
274
310
|
"""
|
|
275
311
|
Expand select list information with all columns and their names
|
|
276
|
-
Returns:
|
|
312
|
+
Returns:
|
|
277
313
|
a pair composed of the list of expressions and the list of corresponding names
|
|
278
314
|
"""
|
|
279
315
|
if select_list is None:
|
|
@@ -281,9 +317,9 @@ class DataFrame:
|
|
|
281
317
|
else:
|
|
282
318
|
expanded_list = select_list
|
|
283
319
|
|
|
284
|
-
out_exprs
|
|
285
|
-
out_names
|
|
286
|
-
seen_out_names
|
|
320
|
+
out_exprs: List[exprs.Expr] = []
|
|
321
|
+
out_names: List[str] = [] # keep track of order
|
|
322
|
+
seen_out_names: set[str] = set() # use to check for duplicates in loop, avoid square complexity
|
|
287
323
|
for i, (expr, name) in enumerate(expanded_list):
|
|
288
324
|
if name is None:
|
|
289
325
|
# use default, add suffix if needed so default adds no duplicates
|
|
@@ -292,13 +328,13 @@ class DataFrame:
|
|
|
292
328
|
column_name = default_name
|
|
293
329
|
if default_name in seen_out_names:
|
|
294
330
|
# already used, then add suffix until unique name is found
|
|
295
|
-
for j in range(1, len(out_names)+1):
|
|
331
|
+
for j in range(1, len(out_names) + 1):
|
|
296
332
|
column_name = f'{default_name}_{j}'
|
|
297
333
|
if column_name not in seen_out_names:
|
|
298
334
|
break
|
|
299
|
-
else:
|
|
335
|
+
else: # no default name, eg some expressions
|
|
300
336
|
column_name = f'col_{i}'
|
|
301
|
-
else:
|
|
337
|
+
else: # user provided name, no attempt to rename
|
|
302
338
|
column_name = name
|
|
303
339
|
|
|
304
340
|
out_exprs.append(expr)
|
|
@@ -326,9 +362,13 @@ class DataFrame:
|
|
|
326
362
|
for item in self._select_list_exprs:
|
|
327
363
|
item.bind_rel_paths(None)
|
|
328
364
|
plan = Planner.create_query_plan(
|
|
329
|
-
self.tbl,
|
|
365
|
+
self.tbl,
|
|
366
|
+
self._select_list_exprs,
|
|
367
|
+
where_clause=self.where_clause,
|
|
368
|
+
group_by_clause=group_by_clause,
|
|
330
369
|
order_by_clause=self.order_by_clause if self.order_by_clause is not None else [],
|
|
331
|
-
limit=self.limit_val if self.limit_val is not None else 0
|
|
370
|
+
limit=self.limit_val if self.limit_val is not None else 0,
|
|
371
|
+
) # limit_val == 0: no limit_val
|
|
332
372
|
|
|
333
373
|
with Env.get().engine.begin() as conn:
|
|
334
374
|
plan.ctx.conn = conn
|
|
@@ -374,12 +414,10 @@ class DataFrame:
|
|
|
374
414
|
result_row = [data_row[e.slot_idx] for e in self._select_list_exprs]
|
|
375
415
|
result_rows.append(result_row)
|
|
376
416
|
except excs.ExprEvalError as e:
|
|
377
|
-
msg =
|
|
378
|
-
f'{type(e.exc).__name__}:\n{str(e.exc)}')
|
|
417
|
+
msg = f'In row {e.row_num} the {e.expr_msg} encountered exception ' f'{type(e.exc).__name__}:\n{str(e.exc)}'
|
|
379
418
|
if len(e.input_vals) > 0:
|
|
380
419
|
input_msgs = [
|
|
381
|
-
f"'{d}' = {d.col_type.print_value(e.input_vals[i])}"
|
|
382
|
-
for i, d in enumerate(e.expr.dependencies())
|
|
420
|
+
f"'{d}' = {d.col_type.print_value(e.input_vals[i])}" for i, d in enumerate(e.expr.dependencies())
|
|
383
421
|
]
|
|
384
422
|
msg += f'\nwith {", ".join(input_msgs)}'
|
|
385
423
|
assert e.exc_tb is not None
|
|
@@ -399,6 +437,7 @@ class DataFrame:
|
|
|
399
437
|
|
|
400
438
|
def count(self) -> int:
|
|
401
439
|
from pixeltable.plan import Planner
|
|
440
|
+
|
|
402
441
|
stmt = Planner.create_count_stmt(self.tbl, self.where_clause)
|
|
403
442
|
with Env.get().engine.connect() as conn:
|
|
404
443
|
result: int = conn.execute(stmt).scalar_one()
|
|
@@ -424,9 +463,9 @@ class DataFrame:
|
|
|
424
463
|
if self.order_by_clause is not None:
|
|
425
464
|
heading_vals.append('Order By')
|
|
426
465
|
heading_vals.extend([''] * (len(self.order_by_clause) - 1))
|
|
427
|
-
info_vals.extend(
|
|
428
|
-
f'{e[0].display_str(inline=False)} {"asc" if e[1] else "desc"}' for e in self.order_by_clause
|
|
429
|
-
|
|
466
|
+
info_vals.extend(
|
|
467
|
+
[f'{e[0].display_str(inline=False)} {"asc" if e[1] else "desc"}' for e in self.order_by_clause]
|
|
468
|
+
)
|
|
430
469
|
if self.limit_val is not None:
|
|
431
470
|
heading_vals.append('Limit')
|
|
432
471
|
info_vals.append(str(self.limit_val))
|
|
@@ -440,9 +479,12 @@ class DataFrame:
|
|
|
440
479
|
pd_df = self._description()
|
|
441
480
|
# white-space: pre-wrap: print \n as newline
|
|
442
481
|
# th: center-align headings
|
|
443
|
-
return
|
|
444
|
-
.
|
|
445
|
-
.
|
|
482
|
+
return (
|
|
483
|
+
pd_df.style.set_properties(**{'white-space': 'pre-wrap', 'text-align': 'left'})
|
|
484
|
+
.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
|
|
485
|
+
.hide(axis='index')
|
|
486
|
+
.hide(axis='columns')
|
|
487
|
+
)
|
|
446
488
|
|
|
447
489
|
def describe(self) -> None:
|
|
448
490
|
"""
|
|
@@ -453,6 +495,7 @@ class DataFrame:
|
|
|
453
495
|
try:
|
|
454
496
|
__IPYTHON__
|
|
455
497
|
from IPython.display import display
|
|
498
|
+
|
|
456
499
|
display(self._description_html())
|
|
457
500
|
except NameError:
|
|
458
501
|
print(self.__repr__())
|
|
@@ -463,16 +506,16 @@ class DataFrame:
|
|
|
463
506
|
def _repr_html_(self) -> str:
|
|
464
507
|
return self._description_html()._repr_html_()
|
|
465
508
|
|
|
466
|
-
def select(self, *items: Any, **named_items
|
|
509
|
+
def select(self, *items: Any, **named_items: Any) -> DataFrame:
|
|
467
510
|
if self.select_list is not None:
|
|
468
511
|
raise excs.Error(f'Select list already specified')
|
|
469
|
-
for
|
|
512
|
+
for name, _ in named_items.items():
|
|
470
513
|
if not isinstance(name, str) or not is_valid_identifier(name):
|
|
471
514
|
raise excs.Error(f'Invalid name: {name}')
|
|
472
515
|
base_list = [(expr, None) for expr in items] + [(expr, k) for (k, expr) in named_items.items()]
|
|
473
516
|
if len(base_list) == 0:
|
|
474
517
|
raise excs.Error(f'Empty select list')
|
|
475
|
-
|
|
518
|
+
|
|
476
519
|
# analyze select list; wrap literals with the corresponding expressions
|
|
477
520
|
select_list = []
|
|
478
521
|
for raw_expr, name in base_list:
|
|
@@ -501,13 +544,25 @@ class DataFrame:
|
|
|
501
544
|
seen.add(name)
|
|
502
545
|
|
|
503
546
|
return DataFrame(
|
|
504
|
-
self.tbl,
|
|
505
|
-
|
|
547
|
+
self.tbl,
|
|
548
|
+
select_list=select_list,
|
|
549
|
+
where_clause=self.where_clause,
|
|
550
|
+
group_by_clause=self.group_by_clause,
|
|
551
|
+
grouping_tbl=self.grouping_tbl,
|
|
552
|
+
order_by_clause=self.order_by_clause,
|
|
553
|
+
limit=self.limit_val,
|
|
554
|
+
)
|
|
506
555
|
|
|
507
556
|
def where(self, pred: exprs.Predicate) -> DataFrame:
|
|
508
557
|
return DataFrame(
|
|
509
|
-
self.tbl,
|
|
510
|
-
|
|
558
|
+
self.tbl,
|
|
559
|
+
select_list=self.select_list,
|
|
560
|
+
where_clause=pred,
|
|
561
|
+
group_by_clause=self.group_by_clause,
|
|
562
|
+
grouping_tbl=self.grouping_tbl,
|
|
563
|
+
order_by_clause=self.order_by_clause,
|
|
564
|
+
limit=self.limit_val,
|
|
565
|
+
)
|
|
511
566
|
|
|
512
567
|
def group_by(self, *grouping_items: Any) -> DataFrame:
|
|
513
568
|
"""Add a group-by clause to this DataFrame.
|
|
@@ -534,8 +589,14 @@ class DataFrame:
|
|
|
534
589
|
if grouping_tbl is None:
|
|
535
590
|
group_by_clause = list(grouping_items)
|
|
536
591
|
return DataFrame(
|
|
537
|
-
self.tbl,
|
|
538
|
-
|
|
592
|
+
self.tbl,
|
|
593
|
+
select_list=self.select_list,
|
|
594
|
+
where_clause=self.where_clause,
|
|
595
|
+
group_by_clause=group_by_clause,
|
|
596
|
+
grouping_tbl=grouping_tbl,
|
|
597
|
+
order_by_clause=self.order_by_clause,
|
|
598
|
+
limit=self.limit_val,
|
|
599
|
+
)
|
|
539
600
|
|
|
540
601
|
def order_by(self, *expr_list: exprs.Expr, asc: bool = True) -> DataFrame:
|
|
541
602
|
for e in expr_list:
|
|
@@ -544,16 +605,26 @@ class DataFrame:
|
|
|
544
605
|
order_by_clause = self.order_by_clause if self.order_by_clause is not None else []
|
|
545
606
|
order_by_clause.extend([(e.copy(), asc) for e in expr_list])
|
|
546
607
|
return DataFrame(
|
|
547
|
-
self.tbl,
|
|
548
|
-
|
|
549
|
-
|
|
608
|
+
self.tbl,
|
|
609
|
+
select_list=self.select_list,
|
|
610
|
+
where_clause=self.where_clause,
|
|
611
|
+
group_by_clause=self.group_by_clause,
|
|
612
|
+
grouping_tbl=self.grouping_tbl,
|
|
613
|
+
order_by_clause=order_by_clause,
|
|
614
|
+
limit=self.limit_val,
|
|
615
|
+
)
|
|
550
616
|
|
|
551
617
|
def limit(self, n: int) -> DataFrame:
|
|
552
618
|
assert n is not None and isinstance(n, int)
|
|
553
619
|
return DataFrame(
|
|
554
|
-
self.tbl,
|
|
555
|
-
|
|
556
|
-
|
|
620
|
+
self.tbl,
|
|
621
|
+
select_list=self.select_list,
|
|
622
|
+
where_clause=self.where_clause,
|
|
623
|
+
group_by_clause=self.group_by_clause,
|
|
624
|
+
grouping_tbl=self.grouping_tbl,
|
|
625
|
+
order_by_clause=self.order_by_clause,
|
|
626
|
+
limit=n,
|
|
627
|
+
)
|
|
557
628
|
|
|
558
629
|
def __getitem__(self, index: object) -> DataFrame:
|
|
559
630
|
"""
|
|
@@ -571,24 +642,27 @@ class DataFrame:
|
|
|
571
642
|
if isinstance(index, list):
|
|
572
643
|
return self.select(*index)
|
|
573
644
|
raise TypeError(f'Invalid index type: {type(index)}')
|
|
574
|
-
|
|
645
|
+
|
|
575
646
|
def _as_dict(self) -> Dict[str, Any]:
|
|
576
|
-
"""
|
|
577
|
-
|
|
578
|
-
|
|
647
|
+
"""
|
|
648
|
+
Returns:
|
|
649
|
+
Dictionary representing this dataframe.
|
|
579
650
|
"""
|
|
580
651
|
tbl_versions = self.tbl.get_tbl_versions()
|
|
581
652
|
d = {
|
|
582
653
|
'_classname': 'DataFrame',
|
|
583
654
|
'tbl_ids': [str(t.id) for t in tbl_versions],
|
|
584
655
|
'tbl_versions': [t.version for t in tbl_versions],
|
|
585
|
-
'select_list':
|
|
586
|
-
|
|
656
|
+
'select_list': [(e.as_dict(), name) for (e, name) in self.select_list]
|
|
657
|
+
if self.select_list is not None
|
|
658
|
+
else None,
|
|
587
659
|
'where_clause': self.where_clause.as_dict() if self.where_clause is not None else None,
|
|
588
|
-
'group_by_clause':
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
660
|
+
'group_by_clause': [e.as_dict() for e in self.group_by_clause]
|
|
661
|
+
if self.group_by_clause is not None
|
|
662
|
+
else None,
|
|
663
|
+
'order_by_clause': [(e.as_dict(), asc) for (e, asc) in self.order_by_clause]
|
|
664
|
+
if self.order_by_clause is not None
|
|
665
|
+
else None,
|
|
592
666
|
'limit_val': self.limit_val,
|
|
593
667
|
}
|
|
594
668
|
return d
|
|
@@ -615,7 +689,7 @@ class DataFrame:
|
|
|
615
689
|
summary_string = json.dumps(self._as_dict())
|
|
616
690
|
cache_key = hashlib.sha256(summary_string.encode()).hexdigest()
|
|
617
691
|
|
|
618
|
-
dest_path =
|
|
692
|
+
dest_path = Env.get().dataset_cache_dir / f'coco_{cache_key}'
|
|
619
693
|
if dest_path.exists():
|
|
620
694
|
assert dest_path.is_dir()
|
|
621
695
|
data_file_path = dest_path / 'data.json'
|
|
@@ -660,14 +734,14 @@ class DataFrame:
|
|
|
660
734
|
Env.get().require_package('torch')
|
|
661
735
|
Env.get().require_package('torchvision')
|
|
662
736
|
|
|
663
|
-
from pixeltable.
|
|
664
|
-
from pixeltable.utils.pytorch import PixeltablePytorchDataset
|
|
737
|
+
from pixeltable.io.parquet import save_parquet # pylint: disable=import-outside-toplevel
|
|
738
|
+
from pixeltable.utils.pytorch import PixeltablePytorchDataset # pylint: disable=import-outside-toplevel
|
|
665
739
|
|
|
666
|
-
summary_string = json.dumps(self._as_dict())
|
|
740
|
+
summary_string = json.dumps(self._as_dict())
|
|
667
741
|
cache_key = hashlib.sha256(summary_string.encode()).hexdigest()
|
|
668
|
-
|
|
669
|
-
dest_path = (Env.get().dataset_cache_dir / f'df_{cache_key}').with_suffix('.parquet')
|
|
670
|
-
if dest_path.exists():
|
|
742
|
+
|
|
743
|
+
dest_path = (Env.get().dataset_cache_dir / f'df_{cache_key}').with_suffix('.parquet') # pylint: disable = protected-access
|
|
744
|
+
if dest_path.exists(): # fast path: use cache
|
|
671
745
|
assert dest_path.is_dir()
|
|
672
746
|
else:
|
|
673
747
|
save_parquet(self, dest_path)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .remote import Remote
|