pixeltable 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +21 -4
- pixeltable/catalog/__init__.py +13 -0
- pixeltable/catalog/catalog.py +159 -0
- pixeltable/catalog/column.py +200 -0
- pixeltable/catalog/dir.py +32 -0
- pixeltable/catalog/globals.py +33 -0
- pixeltable/catalog/insertable_table.py +191 -0
- pixeltable/catalog/named_function.py +36 -0
- pixeltable/catalog/path.py +58 -0
- pixeltable/catalog/path_dict.py +139 -0
- pixeltable/catalog/schema_object.py +39 -0
- pixeltable/catalog/table.py +581 -0
- pixeltable/catalog/table_version.py +749 -0
- pixeltable/catalog/table_version_path.py +133 -0
- pixeltable/catalog/view.py +203 -0
- pixeltable/client.py +520 -31
- pixeltable/dataframe.py +540 -349
- pixeltable/env.py +373 -48
- pixeltable/exceptions.py +12 -21
- pixeltable/exec/__init__.py +9 -0
- pixeltable/exec/aggregation_node.py +78 -0
- pixeltable/exec/cache_prefetch_node.py +113 -0
- pixeltable/exec/component_iteration_node.py +79 -0
- pixeltable/exec/data_row_batch.py +95 -0
- pixeltable/exec/exec_context.py +22 -0
- pixeltable/exec/exec_node.py +61 -0
- pixeltable/exec/expr_eval_node.py +217 -0
- pixeltable/exec/in_memory_data_node.py +69 -0
- pixeltable/exec/media_validation_node.py +43 -0
- pixeltable/exec/sql_scan_node.py +225 -0
- pixeltable/exprs/__init__.py +24 -0
- pixeltable/exprs/arithmetic_expr.py +102 -0
- pixeltable/exprs/array_slice.py +71 -0
- pixeltable/exprs/column_property_ref.py +77 -0
- pixeltable/exprs/column_ref.py +105 -0
- pixeltable/exprs/comparison.py +77 -0
- pixeltable/exprs/compound_predicate.py +98 -0
- pixeltable/exprs/data_row.py +187 -0
- pixeltable/exprs/expr.py +586 -0
- pixeltable/exprs/expr_set.py +39 -0
- pixeltable/exprs/function_call.py +380 -0
- pixeltable/exprs/globals.py +69 -0
- pixeltable/exprs/image_member_access.py +115 -0
- pixeltable/exprs/image_similarity_predicate.py +58 -0
- pixeltable/exprs/inline_array.py +107 -0
- pixeltable/exprs/inline_dict.py +101 -0
- pixeltable/exprs/is_null.py +38 -0
- pixeltable/exprs/json_mapper.py +121 -0
- pixeltable/exprs/json_path.py +159 -0
- pixeltable/exprs/literal.py +54 -0
- pixeltable/exprs/object_ref.py +41 -0
- pixeltable/exprs/predicate.py +44 -0
- pixeltable/exprs/row_builder.py +355 -0
- pixeltable/exprs/rowid_ref.py +94 -0
- pixeltable/exprs/type_cast.py +53 -0
- pixeltable/exprs/variable.py +45 -0
- pixeltable/func/__init__.py +9 -0
- pixeltable/func/aggregate_function.py +194 -0
- pixeltable/func/batched_function.py +53 -0
- pixeltable/func/callable_function.py +69 -0
- pixeltable/func/expr_template_function.py +82 -0
- pixeltable/func/function.py +110 -0
- pixeltable/func/function_registry.py +227 -0
- pixeltable/func/globals.py +36 -0
- pixeltable/func/nos_function.py +202 -0
- pixeltable/func/signature.py +166 -0
- pixeltable/func/udf.py +163 -0
- pixeltable/functions/__init__.py +52 -103
- pixeltable/functions/eval.py +216 -0
- pixeltable/functions/fireworks.py +61 -0
- pixeltable/functions/huggingface.py +120 -0
- pixeltable/functions/image.py +16 -0
- pixeltable/functions/openai.py +88 -0
- pixeltable/functions/pil/image.py +148 -7
- pixeltable/functions/string.py +13 -0
- pixeltable/functions/together.py +27 -0
- pixeltable/functions/util.py +41 -0
- pixeltable/functions/video.py +62 -0
- pixeltable/iterators/__init__.py +3 -0
- pixeltable/iterators/base.py +48 -0
- pixeltable/iterators/document.py +311 -0
- pixeltable/iterators/video.py +89 -0
- pixeltable/metadata/__init__.py +54 -0
- pixeltable/metadata/converters/convert_10.py +18 -0
- pixeltable/metadata/schema.py +211 -0
- pixeltable/plan.py +656 -0
- pixeltable/store.py +413 -182
- pixeltable/tests/conftest.py +143 -86
- pixeltable/tests/test_audio.py +65 -0
- pixeltable/tests/test_catalog.py +27 -0
- pixeltable/tests/test_client.py +14 -14
- pixeltable/tests/test_component_view.py +372 -0
- pixeltable/tests/test_dataframe.py +433 -0
- pixeltable/tests/test_dirs.py +78 -62
- pixeltable/tests/test_document.py +117 -0
- pixeltable/tests/test_exprs.py +591 -135
- pixeltable/tests/test_function.py +297 -67
- pixeltable/tests/test_functions.py +283 -1
- pixeltable/tests/test_migration.py +43 -0
- pixeltable/tests/test_nos.py +54 -0
- pixeltable/tests/test_snapshot.py +208 -0
- pixeltable/tests/test_table.py +1086 -258
- pixeltable/tests/test_transactional_directory.py +42 -0
- pixeltable/tests/test_types.py +5 -11
- pixeltable/tests/test_video.py +149 -34
- pixeltable/tests/test_view.py +530 -0
- pixeltable/tests/utils.py +186 -45
- pixeltable/tool/create_test_db_dump.py +149 -0
- pixeltable/type_system.py +490 -133
- pixeltable/utils/__init__.py +17 -46
- pixeltable/utils/clip.py +12 -15
- pixeltable/utils/coco.py +136 -0
- pixeltable/utils/documents.py +39 -0
- pixeltable/utils/filecache.py +195 -0
- pixeltable/utils/help.py +11 -0
- pixeltable/utils/media_store.py +76 -0
- pixeltable/utils/parquet.py +126 -0
- pixeltable/utils/pytorch.py +172 -0
- pixeltable/utils/s3.py +13 -0
- pixeltable/utils/sql.py +17 -0
- pixeltable/utils/transactional_directory.py +35 -0
- pixeltable-0.2.1.dist-info/LICENSE +18 -0
- pixeltable-0.2.1.dist-info/METADATA +119 -0
- pixeltable-0.2.1.dist-info/RECORD +125 -0
- {pixeltable-0.1.2.dist-info → pixeltable-0.2.1.dist-info}/WHEEL +1 -1
- pixeltable/catalog.py +0 -1421
- pixeltable/exprs.py +0 -1745
- pixeltable/function.py +0 -269
- pixeltable/functions/clip.py +0 -10
- pixeltable/functions/pil/__init__.py +0 -23
- pixeltable/functions/tf.py +0 -21
- pixeltable/index.py +0 -57
- pixeltable/tests/test_dict.py +0 -24
- pixeltable/tests/test_tf.py +0 -69
- pixeltable/tf.py +0 -33
- pixeltable/utils/tf.py +0 -33
- pixeltable/utils/video.py +0 -32
- pixeltable-0.1.2.dist-info/LICENSE +0 -201
- pixeltable-0.1.2.dist-info/METADATA +0 -89
- pixeltable-0.1.2.dist-info/RECORD +0 -37
pixeltable/utils/__init__.py
CHANGED
|
@@ -1,46 +1,17 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
return
|
|
12
|
-
|
|
13
|
-
return
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
def get_extracted_frame_path(tbl_id: int, video_col_id: int, version: int, offset: int) -> str:
|
|
19
|
-
return Env.get().img_dir / f'frame_{tbl_id}_{video_col_id}_{version}_{offset}'
|
|
20
|
-
|
|
21
|
-
def computed_imgs(
|
|
22
|
-
tbl_id: Optional[int] = None, col_id: Optional[int] = None, version: Optional[int] = None) -> int:
|
|
23
|
-
path = f'{Env.get().img_dir}/img_'
|
|
24
|
-
path += f'{tbl_id}_' if tbl_id is not None else '*_'
|
|
25
|
-
path += f'{col_id}_' if col_id is not None else '*_'
|
|
26
|
-
path += f'{version}_' if version is not None else '*_'
|
|
27
|
-
path += '*'
|
|
28
|
-
names = glob.glob(path)
|
|
29
|
-
return names
|
|
30
|
-
|
|
31
|
-
def computed_img_count(
|
|
32
|
-
tbl_id: Optional[int] = None, col_id: Optional[int] = None, version: Optional[int] = None) -> int:
|
|
33
|
-
return len(computed_imgs(tbl_id=tbl_id, col_id=col_id, version=version))
|
|
34
|
-
|
|
35
|
-
def extracted_frames(tbl_id: Optional[int] = None, version: Optional[int] = None) -> int:
|
|
36
|
-
path = f'{Env.get().img_dir}/frame_'
|
|
37
|
-
path += f'{tbl_id}_' if tbl_id is not None else '*_'
|
|
38
|
-
path += '*_' # video_col_id
|
|
39
|
-
path += f'{version}_' if version is not None else '*_'
|
|
40
|
-
path += '*_' # offset
|
|
41
|
-
path += '*' # running frame index
|
|
42
|
-
names = glob.glob(path)
|
|
43
|
-
return names
|
|
44
|
-
|
|
45
|
-
def extracted_frame_count(tbl_id: Optional[int] = None, version: Optional[int] = None) -> int:
|
|
46
|
-
return len(extracted_frames(tbl_id, version))
|
|
1
|
+
def print_perf_counter_delta(delta: float) -> str:
    """Format a performance counter delta in a human-readable way.

    Despite its name, this function does not print anything: it returns the formatted string.
    The unit is picked from the magnitude of `delta`, so a negative delta is rendered in the same
    unit as the positive delta of equal size (the sign is kept in the output).

    Args:
        delta: delta in seconds

    Returns:
        Human-readable string: the value with two decimals, followed by ns, us, ms or s
    """
    # choose the unit from the absolute value; comparing the signed value would send every
    # negative delta into the nanosecond branch
    magnitude = abs(delta)
    if magnitude < 1e-6:
        return f'{delta * 1e9:.2f} ns'
    if magnitude < 1e-3:
        return f'{delta * 1e6:.2f} us'
    if magnitude < 1:
        return f'{delta * 1e3:.2f} ms'
    return f'{delta:.2f} s'
|
pixeltable/utils/clip.py
CHANGED
|
@@ -1,21 +1,18 @@
|
|
|
1
1
|
import numpy as np
|
|
2
2
|
import PIL.Image
|
|
3
|
-
import clip
|
|
4
|
-
import torch
|
|
5
|
-
import PIL.Image
|
|
6
3
|
|
|
4
|
+
import pixeltable.func as func
|
|
5
|
+
from pixeltable.env import Env
|
|
7
6
|
|
|
8
|
-
_device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
|
9
|
-
_model, _preprocess = clip.load("ViT-B/32", device=_device)
|
|
10
7
|
|
|
11
|
-
def
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
return
|
|
8
|
+
def embed_image(img: PIL.Image.Image) -> np.ndarray:
    """Return the embedding of `img` computed by the `openai_clip` image-embedding model.

    The image is resized to 224x224 and sent to the NOS client as a one-element batch; axis 0 of
    the 'embedding' entry of the response is squeezed away before returning.
    """
    # imported at call time, not at module load
    from pixeltable.functions.nos.image_embedding import openai_clip

    spec = openai_clip.model_spec
    client = Env.get().nos_client
    batch = [img.resize((224, 224))]
    response = client.Run(task=spec.task, model_name=spec.name, images=batch)
    return response['embedding'].squeeze(0)
|
|
16
13
|
|
|
17
|
-
def
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
return
|
|
14
|
+
def embed_text(text: str) -> np.ndarray:
    """Return the embedding of `text` computed by the `openai_clip` text-embedding model.

    The string is sent to the NOS client as a one-element batch; axis 0 of the 'embedding'
    entry of the response is squeezed away before returning.
    """
    # imported at call time, not at module load
    from pixeltable.functions.nos.text_embedding import openai_clip

    spec = openai_clip.model_spec
    client = Env.get().nos_client
    response = client.Run(task=spec.task, model_name=spec.name, texts=[text])
    return response['embedding'].squeeze(0)
|
pixeltable/utils/coco.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
from typing import List, Dict, Any, Set
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
import PIL
|
|
6
|
+
|
|
7
|
+
import pixeltable.exceptions as excs
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
format_msg = """
|
|
11
|
+
|
|
12
|
+
Required format:
|
|
13
|
+
{
|
|
14
|
+
'image': PIL.Image.Image,
|
|
15
|
+
'annotations': [
|
|
16
|
+
{
|
|
17
|
+
'bbox': [x: int, y: int, w: int, h: int],
|
|
18
|
+
'category': str | int,
|
|
19
|
+
},
|
|
20
|
+
...
|
|
21
|
+
],
|
|
22
|
+
}
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
def _verify_input_dict(input_dict: Dict[str, Any]) -> None:
    """Verify that input_dict is a valid input dict for write_coco_dataset().

    Checks, in order: input_dict is a dict; it has an 'image' key holding a PIL.Image.Image; it has
    an 'annotations' key holding a list; every annotation is a dict with a 'bbox' list of exactly
    four ints and a 'category' that is a str or an int.

    Args:
        input_dict: the value to validate

    Raises:
        excs.Error: on the first violation found; the message shows the offending value followed
            by format_msg
    """
    # `import PIL` by itself does not load the PIL.Image submodule, so PIL.Image.Image would only
    # resolve if some other module had already imported it; import it explicitly here
    import PIL.Image

    if not isinstance(input_dict, dict):
        raise excs.Error(f'Expected dict, got {input_dict}{format_msg}')
    if 'image' not in input_dict:
        raise excs.Error(f'Missing key "image" in input dict: {input_dict}{format_msg}')
    if not isinstance(input_dict['image'], PIL.Image.Image):
        raise excs.Error(f'Value for "image" is not a PIL.Image.Image: {input_dict}{format_msg}')
    if 'annotations' not in input_dict:
        raise excs.Error(f'Missing key "annotations" in input dict: {input_dict}{format_msg}')
    if not isinstance(input_dict['annotations'], list):
        raise excs.Error(f'Value for "annotations" is not a list: {input_dict}{format_msg}')
    for annotation in input_dict['annotations']:
        if not isinstance(annotation, dict):
            raise excs.Error(f'Annotation is not a dict: {annotation}{format_msg}')
        if 'bbox' not in annotation:
            raise excs.Error(f'Missing key "bbox" in annotation: {annotation}{format_msg}')
        bbox = annotation['bbox']
        if not isinstance(bbox, list):
            raise excs.Error(f'Value for "bbox" is not a list [x, y, w, h]: {annotation}{format_msg}')
        if len(bbox) != 4 or not all(isinstance(x, int) for x in bbox):
            raise excs.Error(f'Key "bbox" is not a list [x, y, w, h] of ints: {annotation}{format_msg}')
        if 'category' not in annotation:
            raise excs.Error(f'Missing key "category" in annotation: {annotation}{format_msg}')
        if not isinstance(annotation['category'], (str, int)):
            raise excs.Error(f'Value for "category" is not a str or int: {annotation}{format_msg}')
|
|
50
|
+
|
|
51
|
+
def write_coco_dataset(df: 'pixeltable.DataFrame', dest_path: Path) -> Path:
    """Export a DataFrame result set as a COCO dataset in dest_path and return the path of the data.json file.

    The select list of df must consist of exactly one json-typed expression; every row's value for
    it is checked with _verify_input_dict(). dest_path is created by this function, together with
    an 'images' subdirectory. Images for which the row already carries a local file path are
    referenced in place; all others are saved as '<image id>.jpg' in the 'images' subdirectory.
    Category ids are assigned by enumerating the sorted distinct categories.

    Args:
        df: DataFrame to export
        dest_path: directory to create; must not exist yet, but its parent must

    Returns:
        Path of the data.json file written into dest_path

    Raises:
        excs.Error: if the select list is not a single json-typed expression, if dest_path already
            exists, or if a row's value does not have the required format
    """
    # TODO: validate schema
    if len(df._select_list_exprs) != 1 or not df._select_list_exprs[0].col_type.is_json_type():
        raise excs.Error(f'Expected exactly one json-typed column in select list: {df._select_list_exprs}')
    if dest_path.exists():
        # an explicit check instead of an assert, so that it is also enforced under `python -O`
        raise excs.Error(f'Destination path already exists: {dest_path}')

    # create output dir
    dest_path.mkdir(parents=False)
    images_dir = dest_path / 'images'
    images_dir.mkdir()

    images: List[Dict[str, Any]] = []
    annotations: List[Dict[str, Any]] = []
    categories: Set[Any] = set()
    ann_id = -1
    input_dict_expr = None
    input_dict_slot_idx = -1  # df._select_list_exprs[0].slot_idx isn't valid until _exec()
    img_slot_idx = None
    for img_id, input_row in enumerate(df._exec()):
        if input_dict_expr is None:
            input_dict_expr = df._select_list_exprs[0]
            input_dict_slot_idx = input_dict_expr.slot_idx
        input_dict = input_row[input_dict_slot_idx]
        _verify_input_dict(input_dict)

        if img_slot_idx is None:
            # we want to know the slot idx of the image used in the input dict, so that we can check whether we
            # already have a local path for it
            img_slot_idx = next(
                (e.slot_idx for e in input_dict_expr.dependencies() if e.col_type.is_image_type()), None)
            assert img_slot_idx is not None

        # get a local path for the image
        img = input_dict['image']
        if input_row.file_paths[img_slot_idx] is not None:
            # we already have a local path
            img_path = Path(input_row.file_paths[img_slot_idx])
            # TODO: if the path leads to our tmp dir, we need to move the file
        else:
            # we need to create a local path
            img_path = images_dir / f'{img_id}.jpg'
            img.save(img_path)

        # create image record
        images.append({
            'id': img_id,
            'file_name': str(img_path),
            'width': img.width,
            'height': img.height,
        })

        # create annotation records for this image
        for annotation in input_dict['annotations']:
            ann_id += 1
            _, _, w, h = annotation['bbox']
            category = annotation['category']
            categories.add(category)
            annotations.append({
                'id': ann_id,
                'image_id': img_id,
                # we use the category name here and fix it up at the end, when we have assigned category ids
                'category_id': category,
                'bbox': annotation['bbox'],
                'area': w * h,
                'iscrowd': 0,
            })

    # replace category names with ids
    # categories may be a mix of ints and strs, which cannot be compared with each other; sort ints
    # before strs (the order within each type is the plain sorted() order)
    ordered_categories = sorted(categories, key=lambda c: (isinstance(c, str), c))
    category_ids = {category: category_id for category_id, category in enumerate(ordered_categories)}
    for annotation in annotations:
        annotation['category_id'] = category_ids[annotation['category_id']]

    result = {
        'images': images,
        'annotations': annotations,
        'categories': [{'id': category_id, 'name': category} for category, category_id in category_ids.items()],
    }
    output_path = dest_path / 'data.json'
    with open(output_path, 'w') as f:
        json.dump(result, f)
    return output_path
|
|
136
|
+
|
|
pixeltable/utils/documents.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from typing import Optional, Dict
|
|
2
|
+
import dataclasses
|
|
3
|
+
|
|
4
|
+
import pixeltable.type_system as ts
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclasses.dataclass
class DocumentHandle:
    """Parsed form of a document, tagged with the format it was parsed as.

    get_document_handle() sets bs_doc for HTML documents and md_ast for Markdown documents; the
    other field keeps its default of None.
    """
    # format the document was parsed as
    format: ts.DocumentType.DocumentFormat
    # BeautifulSoup object for the document (set for HTML by get_document_handle())
    bs_doc: Optional['bs4.BeautifulSoup'] = None
    # result of the mistune parser (set for Markdown by get_document_handle())
    md_ast: Optional[Dict] = None
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def get_document_handle(s: str) -> Optional[DocumentHandle]:
    """Parse `s` and return a handle for the first format that accepts it.

    HTML is tried first, then Markdown. Returns None if neither get_html_handle() nor
    get_markdown_handle() produces a result.
    """
    html_doc = get_html_handle(s)
    if html_doc is None:
        markdown_ast = get_markdown_handle(s)
        if markdown_ast is None:
            return None
        return DocumentHandle(format=ts.DocumentType.DocumentFormat.MD, md_ast=markdown_ast)
    return DocumentHandle(format=ts.DocumentType.DocumentFormat.HTML, bs_doc=html_doc)
|
|
22
|
+
|
|
23
|
+
def get_html_handle(s: str) -> Optional['bs4.BeautifulSoup']:
    """Parse `s` with BeautifulSoup's 'html.parser' and return the resulting document.

    Returns None if parsing raises, or if find() on the parsed document returns None.
    """
    import bs4

    try:
        soup = bs4.BeautifulSoup(s, 'html.parser')
    except Exception:
        return None
    # a parse that produced nothing find() can return is not treated as HTML
    return soup if soup.find() is not None else None
|
|
32
|
+
|
|
33
|
+
def get_markdown_handle(s: str) -> Optional[Dict]:
    """Run `s` through a mistune parser created with renderer=None and return the result.

    Returns None if creating the parser or parsing raises.
    """
    import mistune

    try:
        parse = mistune.create_markdown(renderer=None)
        parsed = parse(s)
    except Exception:
        return None
    return parsed
|
|
pixeltable/utils/filecache.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Optional, List, Tuple, Dict
|
|
3
|
+
from collections import OrderedDict, defaultdict, namedtuple
|
|
4
|
+
import os
|
|
5
|
+
import glob
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from time import time
|
|
8
|
+
import logging
|
|
9
|
+
from uuid import UUID
|
|
10
|
+
import hashlib
|
|
11
|
+
|
|
12
|
+
from pixeltable.env import Env
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
_logger = logging.getLogger('pixeltable')
|
|
16
|
+
|
|
17
|
+
class CacheEntry:
    """Bookkeeping record for one cached file: key, owning table/column, size, timestamp, extension."""

    def __init__(self, key: str, tbl_id: UUID, col_id: int, size: int, last_accessed_ts: int, ext: str):
        self.key, self.tbl_id, self.col_id = key, tbl_id, col_id
        self.size, self.last_accessed_ts, self.ext = size, last_accessed_ts, ext

    def path(self) -> Path:
        """Return this entry's file path inside Env.get().file_cache_dir."""
        file_name = f'{self.tbl_id.hex}_{self.col_id}_{self.key}{self.ext}'
        return Env.get().file_cache_dir / file_name

    @classmethod
    def from_file(cls, path: Path) -> CacheEntry:
        """Rebuild an entry from a file whose stem is '<tbl_id hex>_<col_id>_<key>'.

        Size and timestamp are taken from the file's st_size and st_mtime, the extension from
        the path's suffix.
        """
        parts = path.stem.split('_')
        assert len(parts) == 3
        tbl_hex, col_str, key = parts
        tbl_id = UUID(tbl_hex)
        col_id = int(col_str)
        stat_result = os.stat(str(path))
        return cls(key, tbl_id, col_id, stat_result.st_size, stat_result.st_mtime, path.suffix)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class FileCache:
    """
    A local cache of external (eg, S3) file references in cells of a stored table (ie, table or view).

    Cache entries are identified by a hash of the file url and stored in Env.get().file_cache_dir. The time of
    last access of a cache entry is its file's mtime.

    TODO:
    - enforce a maximum capacity with LRU eviction
    - implement MRU eviction for queries that exceed the capacity
    """
    _instance: Optional[FileCache] = None
    ColumnStats = namedtuple('FileCacheColumnStats', ['tbl_id', 'col_id', 'num_files', 'total_size'])
    CacheStats = namedtuple(
        'FileCacheStats', ['total_size', 'num_requests', 'num_hits', 'num_evictions', 'column_stats'])

    @classmethod
    def get(cls) -> FileCache:
        """Return the shared FileCache instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        """Build the in-memory index from the files currently present in the file cache directory."""
        self.cache: OrderedDict[str, CacheEntry] = OrderedDict()  # ordered by entry.last_accessed_ts
        self.total_size = 0
        # TODO: self.capacity = Env.get().max_filecache_size, once eviction is implemented
        self.num_requests = 0
        self.num_hits = 0
        self.num_evictions = 0
        paths = glob.glob(str(Env.get().file_cache_dir / '*'))
        entries = [CacheEntry.from_file(Path(path_str)) for path_str in paths]
        # we need to insert entries in order of last_accessed_ts
        entries.sort(key=lambda e: e.last_accessed_ts)
        for entry in entries:
            self.cache[entry.key] = entry
            self.total_size += entry.size

    def avg_file_size(self) -> int:
        """Return the average size of the cached files, truncated to an int (0 for an empty cache)."""
        if len(self.cache) == 0:
            return 0
        return int(self.total_size / len(self.cache))

    def num_files(self, tbl_id: Optional[UUID] = None) -> int:
        """Return the number of cached files, restricted to those of tbl_id if it is given."""
        if tbl_id is None:
            return len(self.cache)
        return sum(1 for e in self.cache.values() if e.tbl_id == tbl_id)

    def clear(self, tbl_id: Optional[UUID] = None, capacity: Optional[int] = None) -> None:
        """
        For testing purposes: allow resetting capacity and stats.

        Resets the request/hit/eviction counters and removes the cached files of tbl_id, or all
        cached files if tbl_id is None. `capacity` is currently ignored, because capacity
        enforcement is not implemented yet (see the class TODO).
        """
        self.num_requests, self.num_hits, self.num_evictions = 0, 0, 0
        entries = list(self.cache.values())  # copy: we delete from self.cache while iterating
        if tbl_id is not None:
            entries = [e for e in entries if e.tbl_id == tbl_id]
            _logger.debug(f'clearing {len(entries)} entries from file cache for table {tbl_id}')
        else:
            _logger.debug(f'clearing {len(entries)} entries from file cache')
        for entry in entries:
            del self.cache[entry.key]
            self.total_size -= entry.size
            os.remove(entry.path())

    def _url_hash(self, url: str) -> str:
        """Return the hex SHA-256 digest of url, which serves as the cache key."""
        h = hashlib.sha256()
        h.update(url.encode())
        return h.hexdigest()

    def lookup(self, url: str) -> Optional[Path]:
        """Return the local path of the cached file for url, or None on a cache miss.

        A hit refreshes the file's mtime and moves the entry to the most-recently-used end.
        """
        self.num_requests += 1
        key = self._url_hash(url)
        entry = self.cache.get(key, None)
        if entry is None:
            _logger.debug(f'file cache miss for {url}')
            return None
        # update mtime and cache
        path = entry.path()
        path.touch(exist_ok=True)
        file_info = os.stat(str(path))
        entry.last_accessed_ts = file_info.st_mtime
        self.cache.move_to_end(key, last=True)
        self.num_hits += 1
        _logger.debug(f'file cache hit for {url}')
        return path

    def add(self, tbl_id: UUID, col_id: int, url: str, path: Path) -> Path:
        """Adds url at 'path' to cache and returns its new path.
        'path' will not be accessible after this call. Retains the extension of 'path'.

        Args:
            tbl_id: id of the table the file belongs to
            col_id: id of the column the file belongs to
            url: url the file was obtained from; its hash is the cache key
            path: current location of the file; it is renamed into the cache directory

        Returns:
            The file's path inside the cache directory
        """
        file_info = os.stat(str(path))
        # TODO: evict LRU entries here once a capacity is enforced (see the class TODO)
        key = self._url_hash(url)
        assert key not in self.cache
        entry = CacheEntry(key, tbl_id, col_id, file_info.st_size, file_info.st_mtime, path.suffix)
        new_path = entry.path()
        # move the file first: if the rename fails, the index must not contain an entry without a file
        os.rename(str(path), str(new_path))
        self.cache[key] = entry
        self.total_size += entry.size
        _logger.debug(f'added entry for cell {url} to file cache')
        return new_path

    def stats(self) -> CacheStats:
        """Return overall counters plus per-(table, column) file counts and sizes, largest size first."""
        # collect column stats
        # (tbl_id, col_id) -> [num_files, total_size]
        d: Dict[Tuple[UUID, int], List[int]] = defaultdict(lambda: [0, 0])
        for entry in self.cache.values():
            t = d[(entry.tbl_id, entry.col_id)]
            t[0] += 1
            t[1] += entry.size
        col_stats = [
            self.ColumnStats(tbl_id, col_id, num_files, size) for (tbl_id, col_id), (num_files, size) in d.items()
        ]
        col_stats.sort(key=lambda e: e.total_size, reverse=True)
        return self.CacheStats(self.total_size, self.num_requests, self.num_hits, self.num_evictions, col_stats)

    def debug_print(self) -> None:
        """Print one line per cache entry to stdout."""
        for entry in self.cache.values():
            print(f'CacheEntry: tbl_id={entry.tbl_id}, col_id={entry.col_id}, size={entry.size}')
|
pixeltable/utils/media_store.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import glob
|
|
2
|
+
import os
|
|
3
|
+
import re
|
|
4
|
+
import shutil
|
|
5
|
+
import uuid
|
|
6
|
+
from typing import Optional, List, Tuple, Dict
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from collections import defaultdict
|
|
9
|
+
from uuid import UUID
|
|
10
|
+
|
|
11
|
+
from pixeltable.env import Env
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class MediaStore:
    """
    Utilities to manage media files stored in Env.media_dir

    Media file names are a composite of: table id, column id, version, uuid:
    the table id/column id/version are redundant but useful for identifying all files for a table
    or all files created for a particular version of a table
    """
    pattern = re.compile(r'([0-9a-fA-F]+)_(\d+)_(\d+)_([0-9a-fA-F]+)')  # tbl_id, col_id, version, uuid

    @classmethod
    def prepare_media_path(cls, tbl_id: UUID, col_id: int, version: int, ext: Optional[str] = None) -> Path:
        """
        Construct a new, unique Path name for a persisted media file, and create the parent directory
        for the new Path if it does not already exist. The Path will reside in
        the environment's media_dir.

        Args:
            tbl_id: id of the table the file belongs to
            col_id: id of the column the file belongs to
            version: table version the file is created for
            ext: file extension appended verbatim to the file name; nothing is appended if None or empty

        Returns:
            Path of the form <media_dir>/<tbl_id hex>/<2 hex chars>/<4 hex chars>/<file name>
        """
        id_hex = uuid.uuid4().hex
        # two directory levels derived from the random id
        parent = Env.get().media_dir / tbl_id.hex / id_hex[0:2] / id_hex[0:4]
        parent.mkdir(parents=True, exist_ok=True)
        return parent / f'{tbl_id.hex}_{col_id}_{version}_{id_hex}{ext or ""}'

    @classmethod
    def delete(cls, tbl_id: UUID, version: Optional[int] = None) -> None:
        """Delete all files belonging to tbl_id. If version is not None, delete
        only those files belonging to the specified version."""
        assert tbl_id is not None
        if version is None:
            # Remove the entire folder for this table id.
            path = Env.get().media_dir / tbl_id.hex
            if path.exists():
                shutil.rmtree(path)
        else:
            # Remove only the elements for the specified version.
            paths = glob.glob(str(Env.get().media_dir / tbl_id.hex) + f'/**/{tbl_id.hex}_*_{version}_*', recursive=True)
            for path in paths:
                os.remove(path)

    @classmethod
    def count(cls, tbl_id: UUID) -> int:
        """
        Return number of files for given tbl_id.
        """
        paths = glob.glob(str(Env.get().media_dir / tbl_id.hex) + f'/**/{tbl_id.hex}_*', recursive=True)
        return len(paths)

    @classmethod
    def stats(cls) -> List[Tuple[UUID, int, int, int]]:
        """Return (tbl_id, col_id, num_files, total size) per table column, largest total size first.

        Only files whose name matches `pattern` are counted; any other file found below media_dir
        is ignored.
        """
        paths = glob.glob(str(Env.get().media_dir) + "/**", recursive=True)
        # key: (tbl_id, col_id), value: [num_files, size]
        d: Dict[Tuple[UUID, int], List[int]] = defaultdict(lambda: [0, 0])
        for p in paths:
            if os.path.isdir(p):
                continue
            matched = cls.pattern.match(Path(p).name)
            if matched is None:
                # skip files that don't follow the media file naming scheme instead of failing on them
                continue
            tbl_id, col_id = UUID(hex=matched[1]), int(matched[2])
            file_info = os.stat(p)
            t = d[(tbl_id, col_id)]
            t[0] += 1
            t[1] += file_info.st_size
        result = [(tbl_id, col_id, num_files, size) for (tbl_id, col_id), (num_files, size) in d.items()]
        result.sort(key=lambda e: e[3], reverse=True)
        return result
|