pixeltable 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (139) hide show
  1. pixeltable/__init__.py +34 -6
  2. pixeltable/catalog/__init__.py +13 -0
  3. pixeltable/catalog/catalog.py +159 -0
  4. pixeltable/catalog/column.py +200 -0
  5. pixeltable/catalog/dir.py +32 -0
  6. pixeltable/catalog/globals.py +33 -0
  7. pixeltable/catalog/insertable_table.py +191 -0
  8. pixeltable/catalog/named_function.py +36 -0
  9. pixeltable/catalog/path.py +58 -0
  10. pixeltable/catalog/path_dict.py +139 -0
  11. pixeltable/catalog/schema_object.py +39 -0
  12. pixeltable/catalog/table.py +581 -0
  13. pixeltable/catalog/table_version.py +749 -0
  14. pixeltable/catalog/table_version_path.py +133 -0
  15. pixeltable/catalog/view.py +203 -0
  16. pixeltable/client.py +520 -30
  17. pixeltable/dataframe.py +540 -349
  18. pixeltable/env.py +373 -45
  19. pixeltable/exceptions.py +12 -21
  20. pixeltable/exec/__init__.py +9 -0
  21. pixeltable/exec/aggregation_node.py +78 -0
  22. pixeltable/exec/cache_prefetch_node.py +113 -0
  23. pixeltable/exec/component_iteration_node.py +79 -0
  24. pixeltable/exec/data_row_batch.py +95 -0
  25. pixeltable/exec/exec_context.py +22 -0
  26. pixeltable/exec/exec_node.py +61 -0
  27. pixeltable/exec/expr_eval_node.py +217 -0
  28. pixeltable/exec/in_memory_data_node.py +69 -0
  29. pixeltable/exec/media_validation_node.py +43 -0
  30. pixeltable/exec/sql_scan_node.py +225 -0
  31. pixeltable/exprs/__init__.py +24 -0
  32. pixeltable/exprs/arithmetic_expr.py +102 -0
  33. pixeltable/exprs/array_slice.py +71 -0
  34. pixeltable/exprs/column_property_ref.py +77 -0
  35. pixeltable/exprs/column_ref.py +105 -0
  36. pixeltable/exprs/comparison.py +77 -0
  37. pixeltable/exprs/compound_predicate.py +98 -0
  38. pixeltable/exprs/data_row.py +187 -0
  39. pixeltable/exprs/expr.py +586 -0
  40. pixeltable/exprs/expr_set.py +39 -0
  41. pixeltable/exprs/function_call.py +380 -0
  42. pixeltable/exprs/globals.py +69 -0
  43. pixeltable/exprs/image_member_access.py +115 -0
  44. pixeltable/exprs/image_similarity_predicate.py +58 -0
  45. pixeltable/exprs/inline_array.py +107 -0
  46. pixeltable/exprs/inline_dict.py +101 -0
  47. pixeltable/exprs/is_null.py +38 -0
  48. pixeltable/exprs/json_mapper.py +121 -0
  49. pixeltable/exprs/json_path.py +159 -0
  50. pixeltable/exprs/literal.py +54 -0
  51. pixeltable/exprs/object_ref.py +41 -0
  52. pixeltable/exprs/predicate.py +44 -0
  53. pixeltable/exprs/row_builder.py +355 -0
  54. pixeltable/exprs/rowid_ref.py +94 -0
  55. pixeltable/exprs/type_cast.py +53 -0
  56. pixeltable/exprs/variable.py +45 -0
  57. pixeltable/func/__init__.py +9 -0
  58. pixeltable/func/aggregate_function.py +194 -0
  59. pixeltable/func/batched_function.py +53 -0
  60. pixeltable/func/callable_function.py +69 -0
  61. pixeltable/func/expr_template_function.py +82 -0
  62. pixeltable/func/function.py +110 -0
  63. pixeltable/func/function_registry.py +227 -0
  64. pixeltable/func/globals.py +36 -0
  65. pixeltable/func/nos_function.py +202 -0
  66. pixeltable/func/signature.py +166 -0
  67. pixeltable/func/udf.py +163 -0
  68. pixeltable/functions/__init__.py +52 -103
  69. pixeltable/functions/eval.py +216 -0
  70. pixeltable/functions/fireworks.py +61 -0
  71. pixeltable/functions/huggingface.py +120 -0
  72. pixeltable/functions/image.py +16 -0
  73. pixeltable/functions/openai.py +88 -0
  74. pixeltable/functions/pil/image.py +148 -7
  75. pixeltable/functions/string.py +13 -0
  76. pixeltable/functions/together.py +27 -0
  77. pixeltable/functions/util.py +41 -0
  78. pixeltable/functions/video.py +62 -0
  79. pixeltable/iterators/__init__.py +3 -0
  80. pixeltable/iterators/base.py +48 -0
  81. pixeltable/iterators/document.py +311 -0
  82. pixeltable/iterators/video.py +89 -0
  83. pixeltable/metadata/__init__.py +54 -0
  84. pixeltable/metadata/converters/convert_10.py +18 -0
  85. pixeltable/metadata/schema.py +211 -0
  86. pixeltable/plan.py +656 -0
  87. pixeltable/store.py +413 -182
  88. pixeltable/tests/conftest.py +143 -87
  89. pixeltable/tests/test_audio.py +65 -0
  90. pixeltable/tests/test_catalog.py +27 -0
  91. pixeltable/tests/test_client.py +14 -14
  92. pixeltable/tests/test_component_view.py +372 -0
  93. pixeltable/tests/test_dataframe.py +433 -0
  94. pixeltable/tests/test_dirs.py +78 -62
  95. pixeltable/tests/test_document.py +117 -0
  96. pixeltable/tests/test_exprs.py +591 -135
  97. pixeltable/tests/test_function.py +297 -67
  98. pixeltable/tests/test_functions.py +283 -1
  99. pixeltable/tests/test_migration.py +43 -0
  100. pixeltable/tests/test_nos.py +54 -0
  101. pixeltable/tests/test_snapshot.py +208 -0
  102. pixeltable/tests/test_table.py +1085 -262
  103. pixeltable/tests/test_transactional_directory.py +42 -0
  104. pixeltable/tests/test_types.py +5 -11
  105. pixeltable/tests/test_video.py +149 -34
  106. pixeltable/tests/test_view.py +530 -0
  107. pixeltable/tests/utils.py +186 -45
  108. pixeltable/tool/create_test_db_dump.py +149 -0
  109. pixeltable/type_system.py +490 -126
  110. pixeltable/utils/__init__.py +17 -46
  111. pixeltable/utils/clip.py +12 -15
  112. pixeltable/utils/coco.py +136 -0
  113. pixeltable/utils/documents.py +39 -0
  114. pixeltable/utils/filecache.py +195 -0
  115. pixeltable/utils/help.py +11 -0
  116. pixeltable/utils/media_store.py +76 -0
  117. pixeltable/utils/parquet.py +126 -0
  118. pixeltable/utils/pytorch.py +172 -0
  119. pixeltable/utils/s3.py +13 -0
  120. pixeltable/utils/sql.py +17 -0
  121. pixeltable/utils/transactional_directory.py +35 -0
  122. pixeltable-0.2.0.dist-info/LICENSE +18 -0
  123. pixeltable-0.2.0.dist-info/METADATA +117 -0
  124. pixeltable-0.2.0.dist-info/RECORD +125 -0
  125. {pixeltable-0.1.1.dist-info → pixeltable-0.2.0.dist-info}/WHEEL +1 -1
  126. pixeltable/catalog.py +0 -1421
  127. pixeltable/exprs.py +0 -1745
  128. pixeltable/function.py +0 -269
  129. pixeltable/functions/clip.py +0 -10
  130. pixeltable/functions/pil/__init__.py +0 -23
  131. pixeltable/functions/tf.py +0 -21
  132. pixeltable/index.py +0 -57
  133. pixeltable/tests/test_dict.py +0 -24
  134. pixeltable/tests/test_tf.py +0 -69
  135. pixeltable/tf.py +0 -33
  136. pixeltable/utils/tf.py +0 -33
  137. pixeltable/utils/video.py +0 -32
  138. pixeltable-0.1.1.dist-info/METADATA +0 -31
  139. pixeltable-0.1.1.dist-info/RECORD +0 -36
@@ -1,46 +1,17 @@
1
- from typing import Optional
2
- from pathlib import Path
3
- import glob
4
-
5
- from pixeltable.env import Env
6
-
7
-
8
- def is_computed_img_path(path: str) -> bool:
9
- try:
10
- _ = Path(path).relative_to(Env.get().img_dir)
11
- return True
12
- except ValueError:
13
- return False
14
-
15
- def get_computed_img_path(tbl_id: int, col_id: int, version: int, rowid: int) -> str:
16
- return Env.get().img_dir / f'img_{tbl_id}_{col_id}_{version}_{rowid}.jpg'
17
-
18
- def get_extracted_frame_path(tbl_id: int, video_col_id: int, version: int, offset: int) -> str:
19
- return Env.get().img_dir / f'frame_{tbl_id}_{video_col_id}_{version}_{offset}'
20
-
21
- def computed_imgs(
22
- tbl_id: Optional[int] = None, col_id: Optional[int] = None, version: Optional[int] = None) -> int:
23
- path = f'{Env.get().img_dir}/img_'
24
- path += f'{tbl_id}_' if tbl_id is not None else '*_'
25
- path += f'{col_id}_' if col_id is not None else '*_'
26
- path += f'{version}_' if version is not None else '*_'
27
- path += '*'
28
- names = glob.glob(path)
29
- return names
30
-
31
- def computed_img_count(
32
- tbl_id: Optional[int] = None, col_id: Optional[int] = None, version: Optional[int] = None) -> int:
33
- return len(computed_imgs(tbl_id=tbl_id, col_id=col_id, version=version))
34
-
35
- def extracted_frames(tbl_id: Optional[int] = None, version: Optional[int] = None) -> int:
36
- path = f'{Env.get().img_dir}/frame_'
37
- path += f'{tbl_id}_' if tbl_id is not None else '*_'
38
- path += '*_' # video_col_id
39
- path += f'{version}_' if version is not None else '*_'
40
- path += '*_' # offset
41
- path += '*' # running frame index
42
- names = glob.glob(path)
43
- return names
44
-
45
- def extracted_frame_count(tbl_id: Optional[int] = None, version: Optional[int] = None) -> int:
46
- return len(extracted_frames(tbl_id, version))
1
def print_perf_counter_delta(delta: float) -> str:
    """Format a performance counter delta in a human-readable way.

    Despite its name, this function does not print anything; it returns the formatted string.

    Args:
        delta: delta in seconds

    Returns:
        Human-readable string with two decimals, scaled to ns, us, ms or s
    """
    # choose the unit from the magnitude, so that a negative delta is scaled like its positive counterpart
    # instead of always being reported in ns
    magnitude = abs(delta)
    if magnitude < 1e-6:
        return f'{delta * 1e9:.2f} ns'
    if magnitude < 1e-3:
        return f'{delta * 1e6:.2f} us'
    if magnitude < 1:
        return f'{delta * 1e3:.2f} ms'
    return f'{delta:.2f} s'
pixeltable/utils/clip.py CHANGED
@@ -1,21 +1,18 @@
1
1
  import numpy as np
2
2
  import PIL.Image
3
- import clip
4
- import torch
5
- import PIL.Image
6
3
 
4
+ import pixeltable.func as func
5
+ from pixeltable.env import Env
7
6
 
8
- _device = 'cuda' if torch.cuda.is_available() else 'cpu'
9
- _model, _preprocess = clip.load("ViT-B/32", device=_device)
10
7
 
11
- def encode_image(img: PIL.Image.Image) -> np.ndarray:
12
- preprocessed = _preprocess(img).unsqueeze(0).to(_device)
13
- features = _model.encode_image(preprocessed)
14
- val = features.numpy(force=True).squeeze()
15
- return val
8
def embed_image(img: PIL.Image.Image) -> np.ndarray:
    """Embed an image with the openai_clip image-embedding model, run through the NOS client.

    The image is resized to 224x224 and sent as a single-element list; axis 0 of the returned
    'embedding' entry is squeezed away before it is handed back.
    """
    from pixeltable.functions.nos.image_embedding import openai_clip
    spec = openai_clip.model_spec
    client = Env.get().nos_client
    response = client.Run(task=spec.task, model_name=spec.name, images=[img.resize((224, 224))])
    embedding = response['embedding']
    return embedding.squeeze(0)
16
13
 
17
- def encode_text(txt: str) -> np.ndarray:
18
- preprocessed = clip.tokenize([txt]).to(_device)
19
- features = _model.encode_text(preprocessed)
20
- val = features.numpy(force=True).squeeze()
21
- return val
14
def embed_text(text: str) -> np.ndarray:
    """Embed a string with the openai_clip text-embedding model, run through the NOS client.

    The text is sent as a single-element list; axis 0 of the returned 'embedding' entry is
    squeezed away before it is handed back.
    """
    from pixeltable.functions.nos.text_embedding import openai_clip
    spec = openai_clip.model_spec
    client = Env.get().nos_client
    response = client.Run(task=spec.task, model_name=spec.name, texts=[text])
    embedding = response['embedding']
    return embedding.squeeze(0)
@@ -0,0 +1,136 @@
1
+ from typing import List, Dict, Any, Set
2
+ from pathlib import Path
3
+ import json
4
+
5
+ import PIL
6
+
7
+ import pixeltable.exceptions as excs
8
+
9
+
10
+ format_msg = """
11
+
12
+ Required format:
13
+ {
14
+ 'image': PIL.Image.Image,
15
+ 'annotations': [
16
+ {
17
+ 'bbox': [x: int, y: int, w: int, h: int],
18
+ 'category': str | int,
19
+ },
20
+ ...
21
+ ],
22
+ }
23
+ """
24
+
25
def _verify_input_dict(input_dict: Dict[str, Any]) -> None:
    """Check that input_dict has the structure write_coco_dataset() expects.

    Raises:
        excs.Error: on the first violation found; the message shows the offending dict or annotation,
            followed by format_msg
    """
    # top-level structure: a dict with a PIL image and a list of annotations
    if not isinstance(input_dict, dict):
        raise excs.Error(f'Expected dict, got {input_dict}{format_msg}')
    if 'image' not in input_dict:
        raise excs.Error(f'Missing key "image" in input dict: {input_dict}{format_msg}')
    image = input_dict['image']
    if not isinstance(image, PIL.Image.Image):
        raise excs.Error(f'Value for "image" is not a PIL.Image.Image: {input_dict}{format_msg}')
    if 'annotations' not in input_dict:
        raise excs.Error(f'Missing key "annotations" in input dict: {input_dict}{format_msg}')
    annotation_list = input_dict['annotations']
    if not isinstance(annotation_list, list):
        raise excs.Error(f'Value for "annotations" is not a list: {input_dict}{format_msg}')

    # each annotation: a dict with a 4-int 'bbox' list and a str or int 'category'
    for annotation in annotation_list:
        if not isinstance(annotation, dict):
            raise excs.Error(f'Annotation is not a dict: {annotation}{format_msg}')
        if 'bbox' not in annotation:
            raise excs.Error(f'Missing key "bbox" in annotation: {annotation}{format_msg}')
        bbox = annotation['bbox']
        if not isinstance(bbox, list):
            raise excs.Error(f'Value for "bbox" is not a list [x, y, w, h]: {annotation}{format_msg}')
        is_int_quad = len(bbox) == 4 and all(isinstance(coord, int) for coord in bbox)
        if not is_int_quad:
            raise excs.Error(f'Key "bbox" is not a list [x, y, w, h] of ints: {annotation}{format_msg}')
        if 'category' not in annotation:
            raise excs.Error(f'Missing key "category" in annotation: {annotation}{format_msg}')
        if not isinstance(annotation['category'], (str, int)):
            raise excs.Error(f'Value for "category" is not a str or int: {annotation}{format_msg}')
50
+
51
def write_coco_dataset(df: 'pixeltable.DataFrame', dest_path: Path) -> Path:
    """Export a DataFrame result set as a COCO dataset in dest_path and return the path of the data.json file.

    Args:
        df: DataFrame whose select list consists of exactly one json-typed expression; every value it
            produces must pass _verify_input_dict()
        dest_path: output directory; it must not exist yet and is created without creating parents

    Returns:
        Path of the data.json file written into dest_path

    Raises:
        excs.Error: if the select list is not a single json-typed expression, or if a row fails verification
    """
    # TODO: validate schema
    if len(df._select_list_exprs) != 1 or not df._select_list_exprs[0].col_type.is_json_type():
        raise excs.Error(f'Expected exactly one json-typed column in select list: {df._select_list_exprs}')
    input_dict_expr = df._select_list_exprs[0]
    input_dict_slot_idx = -1  # input_dict_expr.slot_idx isn't valid until _exec()
    img_slot_idx = None  # resolved once the first row has been verified

    # create output dir
    assert not dest_path.exists()
    dest_path.mkdir(parents=False)
    images_dir = dest_path / 'images'
    images_dir.mkdir()

    images: List[Dict[str, Any]] = []
    annotations: List[Dict[str, Any]] = []
    ann_id = -1
    categories: Set[Any] = set()
    for img_id, input_row in enumerate(df._exec()):
        if input_dict_slot_idx == -1:
            input_dict_slot_idx = input_dict_expr.slot_idx
        input_dict = input_row[input_dict_slot_idx]
        _verify_input_dict(input_dict)

        if img_slot_idx is None:
            # we want to know the slot idx of the image used in the input dict, so that we can check whether we
            # already have a local path for it
            img_slot_idx = next(
                (e.slot_idx for e in input_dict_expr.dependencies() if e.col_type.is_image_type()), None)
            assert img_slot_idx is not None

        # get a local path for the image
        img = input_dict['image']
        if input_row.file_paths[img_slot_idx] is not None:
            # we already have a local path
            img_path = Path(input_row.file_paths[img_slot_idx])
            # TODO: if the path leads to our tmp dir, we need to move the file
        else:
            # we need to create a local path
            img_path = images_dir / f'{img_id}.jpg'
            img.save(img_path)

        # create image record
        images.append({
            'id': img_id,
            'file_name': str(img_path),
            'width': img.width,
            'height': img.height,
        })

        # create annotation records for this image
        for annotation in input_dict['annotations']:
            ann_id += 1
            _, _, w, h = annotation['bbox']
            category = annotation['category']
            categories.add(category)
            annotations.append({
                'id': ann_id,
                'image_id': img_id,
                # we use the category name here and fix it up at the end, when we have assigned category ids
                'category_id': category,
                'bbox': annotation['bbox'],
                'area': w * h,
                'iscrowd': 0,
            })

    # replace category names with ids
    # _verify_input_dict() admits both str and int categories, and those can't be compared with each other;
    # ordering ints ahead of strs keeps the ids deterministic and leaves the order unchanged when all
    # categories have the same type
    ordered_categories = sorted(categories, key=lambda c: (isinstance(c, str), c))
    category_ids = {category: cat_id for cat_id, category in enumerate(ordered_categories)}
    for annotation in annotations:
        annotation['category_id'] = category_ids[annotation['category_id']]

    result = {
        'images': images,
        'annotations': annotations,
        'categories': [{'id': cat_id, 'name': category} for category, cat_id in category_ids.items()],
    }
    output_path = dest_path / 'data.json'
    with open(output_path, 'w') as f:
        json.dump(result, f)
    return output_path
136
+
@@ -0,0 +1,39 @@
1
+ from typing import Optional, Dict
2
+ import dataclasses
3
+
4
+ import pixeltable.type_system as ts
5
+
6
+
7
@dataclasses.dataclass
class DocumentHandle:
    """A parsed document together with the format it was parsed as."""

    # format the document was recognized as
    format: ts.DocumentType.DocumentFormat

    # parse result for HTML documents (filled in by get_document_handle() when format is HTML)
    bs_doc: Optional['bs4.BeautifulSoup'] = None

    # parse result for markdown documents (filled in by get_document_handle() when format is MD)
    md_ast: Optional[Dict] = None
12
+
13
+
14
def get_document_handle(s: str) -> Optional[DocumentHandle]:
    """Parse s as HTML first and as markdown second; return None if neither attempt yields a result."""
    html_doc = get_html_handle(s)
    if html_doc is None:
        # not HTML: fall back to markdown
        markdown_result = get_markdown_handle(s)
        if markdown_result is None:
            return None
        return DocumentHandle(format=ts.DocumentType.DocumentFormat.MD, md_ast=markdown_result)
    return DocumentHandle(format=ts.DocumentType.DocumentFormat.HTML, bs_doc=html_doc)
22
+
23
def get_html_handle(s: str) -> Optional['bs4.BeautifulSoup']:
    """Parse s with BeautifulSoup's 'html.parser'.

    Returns:
        The parsed document, or None if parsing raised or doc.find() found nothing in it.
    """
    import bs4
    try:
        soup = bs4.BeautifulSoup(s, 'html.parser')
    except Exception:
        return None
    # NOTE(review): find() without arguments presumably returns the first tag, so None would mean that s
    # contained no markup at all -- confirm against the bs4 documentation
    return soup if soup.find() is not None else None
32
+
33
def get_markdown_handle(s: str) -> Optional[Dict]:
    """Run s through a mistune parser created with renderer=None.

    Returns:
        Whatever the parser returns for s, or None if creating or running the parser raised.
    """
    import mistune
    try:
        parse = mistune.create_markdown(renderer=None)
        return parse(s)
    except Exception:
        return None
@@ -0,0 +1,195 @@
1
+ from __future__ import annotations
2
+ from typing import Optional, List, Tuple, Dict
3
+ from collections import OrderedDict, defaultdict, namedtuple
4
+ import os
5
+ import glob
6
+ from pathlib import Path
7
+ from time import time
8
+ import logging
9
+ from uuid import UUID
10
+ import hashlib
11
+
12
+ from pixeltable.env import Env
13
+
14
+
15
+ _logger = logging.getLogger('pixeltable')
16
+
17
class CacheEntry:
    """Metadata for one file in the file cache.

    The file name encodes the entry as <tbl_id hex>_<col_id>_<key><ext>; path() builds that name and
    from_file() parses it back.
    """

    def __init__(self, key: str, tbl_id: UUID, col_id: int, size: int, last_accessed_ts: int, ext: str):
        self.key, self.tbl_id, self.col_id = key, tbl_id, col_id
        self.size, self.last_accessed_ts, self.ext = size, last_accessed_ts, ext

    def path(self) -> Path:
        """Return the location of this entry's file inside Env.get().file_cache_dir."""
        cache_dir = Env.get().file_cache_dir
        return cache_dir / f'{self.tbl_id.hex}_{self.col_id}_{self.key}{self.ext}'

    @classmethod
    def from_file(cls, path: Path) -> CacheEntry:
        """Rebuild an entry from a cache file: ids and key come from the name, size and mtime from os.stat()."""
        parts = path.stem.split('_')
        assert len(parts) == 3
        tbl_hex, col_str, key = parts
        tbl_id, col_id = UUID(tbl_hex), int(col_str)
        stat_result = os.stat(str(path))
        return cls(key, tbl_id, col_id, stat_result.st_size, stat_result.st_mtime, path.suffix)
38
+
39
+
40
class FileCache:
    """
    A local cache of external (eg, S3) file references in cells of a stored table (ie, table or view).

    Cache entries are identified by a hash of the file url and stored in Env.get().file_cache_dir. The time
    of last access of a cache entry is its file's mtime.

    TODO:
    - enforce a maximum capacity with LRU eviction
    - implement MRU eviction for queries that exceed the capacity (ie, stop evicting once the LRU entry
      is one that the current query brought in)
    """
    _instance: Optional[FileCache] = None
    ColumnStats = namedtuple('FileCacheColumnStats', ['tbl_id', 'col_id', 'num_files', 'total_size'])
    CacheStats = namedtuple(
        'FileCacheStats', ['total_size', 'num_requests', 'num_hits', 'num_evictions', 'column_stats'])

    @classmethod
    def get(cls) -> FileCache:
        """Return the process-wide instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        """Build the in-memory index from the files currently present in the cache directory."""
        self.cache: OrderedDict[str, CacheEntry] = OrderedDict()  # ordered by entry.last_accessed_ts
        self.total_size = 0
        self.num_requests = 0
        self.num_hits = 0
        self.num_evictions = 0
        paths = glob.glob(str(Env.get().file_cache_dir / '*'))
        entries = [CacheEntry.from_file(Path(path_str)) for path_str in paths]
        # we need to insert entries in order of last_accessed_ts
        entries.sort(key=lambda e: e.last_accessed_ts)
        for entry in entries:
            self.cache[entry.key] = entry
            self.total_size += entry.size

    def avg_file_size(self) -> int:
        """Return the mean size of the cached files, truncated to an int; 0 for an empty cache."""
        if len(self.cache) == 0:
            return 0
        return int(self.total_size / len(self.cache))

    def num_files(self, tbl_id: Optional[UUID] = None) -> int:
        """Return the number of cached files, restricted to tbl_id if one is given."""
        if tbl_id is None:
            return len(self.cache)
        return sum(1 for e in self.cache.values() if e.tbl_id == tbl_id)

    def clear(self, tbl_id: Optional[UUID] = None, capacity: Optional[int] = None) -> None:
        """
        For testing purposes: allow resetting capacity and stats.

        Resets the request/hit/eviction counters and removes all entries (or only those of tbl_id, if given),
        including their files. 'capacity' is currently ignored, because no capacity is enforced yet.
        """
        self.num_requests, self.num_hits, self.num_evictions = 0, 0, 0
        entries = list(self.cache.values())  # copy: we delete from self.cache while iterating
        if tbl_id is not None:
            entries = [e for e in entries if e.tbl_id == tbl_id]
            _logger.debug(f'clearing {len(entries)} entries from file cache for table {tbl_id}')
        else:
            _logger.debug(f'clearing {len(entries)} entries from file cache')
        for entry in entries:
            del self.cache[entry.key]
            self.total_size -= entry.size
            os.remove(entry.path())
        # TODO: once a capacity is enforced, set it to 'capacity' here, or reset it to the default if None

    def _url_hash(self, url: str) -> str:
        """Return the hex SHA-256 digest of url, which serves as the cache key."""
        h = hashlib.sha256()
        h.update(url.encode())
        return h.hexdigest()

    def lookup(self, url: str) -> Optional[Path]:
        """Return the local path of the cached file for url, or None on a cache miss.

        A hit refreshes the file's mtime and moves the entry to the most-recently-used end of the cache.
        """
        self.num_requests += 1
        key = self._url_hash(url)
        entry = self.cache.get(key, None)
        if entry is None:
            _logger.debug(f'file cache miss for {url}')
            return None
        # update mtime and cache
        path = entry.path()
        try:
            # os.utime() fails if the file is gone, whereas Path.touch() would silently create an empty file
            # that we'd then hand out as the cached content
            os.utime(path)
            file_info = os.stat(str(path))
        except FileNotFoundError:
            # the file was removed behind our back: drop the stale entry and report a miss
            del self.cache[key]
            self.total_size -= entry.size
            _logger.debug(f'file cache miss for {url}')
            return None
        entry.last_accessed_ts = file_info.st_mtime
        self.cache.move_to_end(key, last=True)
        self.num_hits += 1
        _logger.debug(f'file cache hit for {url}')
        return path

    def add(self, tbl_id: UUID, col_id: int, url: str, path: Path) -> Path:
        """Adds url at 'path' to cache and returns its new path.
        'path' will not be accessible after this call. Retains the extension of 'path'.
        """
        file_info = os.stat(str(path))
        # TODO: once a capacity is enforced, evict LRU entries here until the new file fits, and skip the
        # add if the LRU entry was brought in by the current query
        key = self._url_hash(url)
        assert key not in self.cache
        entry = CacheEntry(key, tbl_id, col_id, file_info.st_size, file_info.st_mtime, path.suffix)
        new_path = entry.path()
        # move the file before registering the entry, so that a failed rename doesn't leave behind an entry
        # without a file
        os.rename(str(path), str(new_path))
        self.cache[key] = entry
        self.total_size += entry.size
        _logger.debug(f'added entry for cell {url} to file cache')
        return new_path

    def stats(self) -> CacheStats:
        """Return overall counters plus per-(table, column) file counts and sizes, largest total size first."""
        # collect column stats
        # (tbl_id, col_id) -> [num_files, total_size]
        d: Dict[Tuple[UUID, int], List[int]] = defaultdict(lambda: [0, 0])
        for entry in self.cache.values():
            t = d[(entry.tbl_id, entry.col_id)]
            t[0] += 1
            t[1] += entry.size
        col_stats = [
            self.ColumnStats(tbl_id, col_id, num_files, size) for (tbl_id, col_id), (num_files, size) in d.items()
        ]
        col_stats.sort(key=lambda e: e.total_size, reverse=True)
        return self.CacheStats(self.total_size, self.num_requests, self.num_hits, self.num_evictions, col_stats)

    def debug_print(self) -> None:
        """Print one line per cache entry."""
        for entry in self.cache.values():
            print(f'CacheEntry: tbl_id={entry.tbl_id}, col_id={entry.col_id}, size={entry.size}')
@@ -0,0 +1,11 @@
1
+ from typing import Any
2
+
3
+ import pixeltable.func as func
4
+
5
+
6
def help(obj: Any) -> None:
    """Print help text for the given object.

    Pixeltable functions print their own help_str(); every other object is handed to Python's
    interactive help system.
    """
    if isinstance(obj, func.Function):
        print(obj.help_str())
    else:
        # __builtins__ is a CPython implementation detail: in an imported module it is normally a dict, for
        # which __builtins__.help raises AttributeError. The builtin help() delegates to pydoc.help, which
        # prints the text itself and returns None, so its result must not be passed to print().
        import pydoc
        pydoc.help(obj)
@@ -0,0 +1,76 @@
1
+ import glob
2
+ import os
3
+ import re
4
+ import shutil
5
+ import uuid
6
+ from typing import Optional, List, Tuple, Dict
7
+ from pathlib import Path
8
+ from collections import defaultdict
9
+ from uuid import UUID
10
+
11
+ from pixeltable.env import Env
12
+
13
+
14
class MediaStore:
    """
    Utilities to manage media files stored in Env.media_dir

    Media file names are a composite of: table id, column id, version, uuid:
    the table id/column id/version are redundant but useful for identifying all files for a table
    or all files created for a particular version of a table
    """
    pattern = re.compile(r'([0-9a-fA-F]+)_(\d+)_(\d+)_([0-9a-fA-F]+)')  # tbl_id, col_id, version, uuid

    @classmethod
    def prepare_media_path(cls, tbl_id: UUID, col_id: int, version: int, ext: Optional[str] = None) -> Path:
        """
        Construct a new, unique Path name for a persisted media file, and create the parent directory
        for the new Path if it does not already exist. The Path will reside in
        the environment's media_dir.
        """
        id_hex = uuid.uuid4().hex
        # fan out into two levels of subdirectories, named after prefixes of the file's uuid
        parent = Env.get().media_dir / tbl_id.hex / id_hex[0:2] / id_hex[0:4]
        parent.mkdir(parents=True, exist_ok=True)
        return parent / f'{tbl_id.hex}_{col_id}_{version}_{id_hex}{ext or ""}'

    @classmethod
    def delete(cls, tbl_id: UUID, version: Optional[int] = None) -> None:
        """Delete all files belonging to tbl_id. If version is not None, delete
        only those files belonging to the specified version."""
        assert tbl_id is not None
        if version is None:
            # Remove the entire folder for this table id.
            path = Env.get().media_dir / tbl_id.hex
            if path.exists():
                shutil.rmtree(path)
        else:
            # Remove only the elements for the specified version.
            paths = glob.glob(str(Env.get().media_dir / tbl_id.hex) + f'/**/{tbl_id.hex}_*_{version}_*', recursive=True)
            for path in paths:
                os.remove(path)

    @classmethod
    def count(cls, tbl_id: UUID) -> int:
        """
        Return number of files for given tbl_id.
        """
        paths = glob.glob(str(Env.get().media_dir / tbl_id.hex) + f'/**/{tbl_id.hex}_*', recursive=True)
        return len(paths)

    @classmethod
    def _parse_file_name(cls, name: str) -> Optional[Tuple[UUID, int]]:
        """Return (tbl_id, col_id) encoded in a media file name, or None if name isn't a media file name."""
        matched = cls.pattern.match(name)
        if matched is None:
            return None
        try:
            return UUID(hex=matched[1]), int(matched[2])
        except ValueError:
            # the leading hex field is not a valid UUID
            return None

    @classmethod
    def stats(cls) -> List[Tuple[UUID, int, int, int]]:
        """Return (tbl_id, col_id, num_files, total size) per column, ordered by total size, largest first.

        Files under media_dir whose names don't follow the media file naming scheme are ignored.
        """
        paths = glob.glob(str(Env.get().media_dir) + "/**", recursive=True)
        # key: (tbl_id, col_id), value: [num_files, size]
        d: Dict[Tuple[UUID, int], List[int]] = defaultdict(lambda: [0, 0])
        for p in paths:
            if os.path.isdir(p):
                continue
            ids = cls._parse_file_name(Path(p).name)
            if ids is None:
                # stray file (not created by prepare_media_path()): skip it instead of failing
                continue
            file_info = os.stat(p)
            t = d[ids]
            t[0] += 1
            t[1] += file_info.st_size
        result = [(tbl_id, col_id, num_files, size) for (tbl_id, col_id), (num_files, size) in d.items()]
        result.sort(key=lambda e: e[3], reverse=True)
        return result