pixeltable 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (119) hide show
  1. pixeltable/__init__.py +53 -0
  2. pixeltable/__version__.py +3 -0
  3. pixeltable/catalog/__init__.py +13 -0
  4. pixeltable/catalog/catalog.py +159 -0
  5. pixeltable/catalog/column.py +181 -0
  6. pixeltable/catalog/dir.py +32 -0
  7. pixeltable/catalog/globals.py +33 -0
  8. pixeltable/catalog/insertable_table.py +192 -0
  9. pixeltable/catalog/named_function.py +36 -0
  10. pixeltable/catalog/path.py +58 -0
  11. pixeltable/catalog/path_dict.py +139 -0
  12. pixeltable/catalog/schema_object.py +39 -0
  13. pixeltable/catalog/table.py +695 -0
  14. pixeltable/catalog/table_version.py +1026 -0
  15. pixeltable/catalog/table_version_path.py +133 -0
  16. pixeltable/catalog/view.py +203 -0
  17. pixeltable/dataframe.py +749 -0
  18. pixeltable/env.py +466 -0
  19. pixeltable/exceptions.py +17 -0
  20. pixeltable/exec/__init__.py +10 -0
  21. pixeltable/exec/aggregation_node.py +78 -0
  22. pixeltable/exec/cache_prefetch_node.py +116 -0
  23. pixeltable/exec/component_iteration_node.py +79 -0
  24. pixeltable/exec/data_row_batch.py +94 -0
  25. pixeltable/exec/exec_context.py +22 -0
  26. pixeltable/exec/exec_node.py +61 -0
  27. pixeltable/exec/expr_eval_node.py +217 -0
  28. pixeltable/exec/in_memory_data_node.py +73 -0
  29. pixeltable/exec/media_validation_node.py +43 -0
  30. pixeltable/exec/sql_scan_node.py +226 -0
  31. pixeltable/exprs/__init__.py +25 -0
  32. pixeltable/exprs/arithmetic_expr.py +102 -0
  33. pixeltable/exprs/array_slice.py +71 -0
  34. pixeltable/exprs/column_property_ref.py +77 -0
  35. pixeltable/exprs/column_ref.py +114 -0
  36. pixeltable/exprs/comparison.py +77 -0
  37. pixeltable/exprs/compound_predicate.py +98 -0
  38. pixeltable/exprs/data_row.py +199 -0
  39. pixeltable/exprs/expr.py +594 -0
  40. pixeltable/exprs/expr_set.py +39 -0
  41. pixeltable/exprs/function_call.py +382 -0
  42. pixeltable/exprs/globals.py +69 -0
  43. pixeltable/exprs/image_member_access.py +96 -0
  44. pixeltable/exprs/in_predicate.py +96 -0
  45. pixeltable/exprs/inline_array.py +109 -0
  46. pixeltable/exprs/inline_dict.py +103 -0
  47. pixeltable/exprs/is_null.py +38 -0
  48. pixeltable/exprs/json_mapper.py +121 -0
  49. pixeltable/exprs/json_path.py +159 -0
  50. pixeltable/exprs/literal.py +66 -0
  51. pixeltable/exprs/object_ref.py +41 -0
  52. pixeltable/exprs/predicate.py +44 -0
  53. pixeltable/exprs/row_builder.py +329 -0
  54. pixeltable/exprs/rowid_ref.py +94 -0
  55. pixeltable/exprs/similarity_expr.py +65 -0
  56. pixeltable/exprs/type_cast.py +53 -0
  57. pixeltable/exprs/variable.py +45 -0
  58. pixeltable/ext/__init__.py +5 -0
  59. pixeltable/ext/functions/yolox.py +92 -0
  60. pixeltable/func/__init__.py +7 -0
  61. pixeltable/func/aggregate_function.py +197 -0
  62. pixeltable/func/callable_function.py +113 -0
  63. pixeltable/func/expr_template_function.py +99 -0
  64. pixeltable/func/function.py +141 -0
  65. pixeltable/func/function_registry.py +227 -0
  66. pixeltable/func/globals.py +46 -0
  67. pixeltable/func/nos_function.py +202 -0
  68. pixeltable/func/signature.py +162 -0
  69. pixeltable/func/udf.py +164 -0
  70. pixeltable/functions/__init__.py +95 -0
  71. pixeltable/functions/eval.py +215 -0
  72. pixeltable/functions/fireworks.py +34 -0
  73. pixeltable/functions/huggingface.py +167 -0
  74. pixeltable/functions/image.py +16 -0
  75. pixeltable/functions/openai.py +289 -0
  76. pixeltable/functions/pil/image.py +147 -0
  77. pixeltable/functions/string.py +13 -0
  78. pixeltable/functions/together.py +143 -0
  79. pixeltable/functions/util.py +52 -0
  80. pixeltable/functions/video.py +62 -0
  81. pixeltable/globals.py +425 -0
  82. pixeltable/index/__init__.py +2 -0
  83. pixeltable/index/base.py +51 -0
  84. pixeltable/index/embedding_index.py +168 -0
  85. pixeltable/io/__init__.py +3 -0
  86. pixeltable/io/hf_datasets.py +188 -0
  87. pixeltable/io/pandas.py +148 -0
  88. pixeltable/io/parquet.py +192 -0
  89. pixeltable/iterators/__init__.py +3 -0
  90. pixeltable/iterators/base.py +52 -0
  91. pixeltable/iterators/document.py +432 -0
  92. pixeltable/iterators/video.py +88 -0
  93. pixeltable/metadata/__init__.py +58 -0
  94. pixeltable/metadata/converters/convert_10.py +18 -0
  95. pixeltable/metadata/converters/convert_12.py +3 -0
  96. pixeltable/metadata/converters/convert_13.py +41 -0
  97. pixeltable/metadata/schema.py +234 -0
  98. pixeltable/plan.py +620 -0
  99. pixeltable/store.py +424 -0
  100. pixeltable/tool/create_test_db_dump.py +184 -0
  101. pixeltable/tool/create_test_video.py +81 -0
  102. pixeltable/type_system.py +846 -0
  103. pixeltable/utils/__init__.py +17 -0
  104. pixeltable/utils/arrow.py +98 -0
  105. pixeltable/utils/clip.py +18 -0
  106. pixeltable/utils/coco.py +136 -0
  107. pixeltable/utils/documents.py +69 -0
  108. pixeltable/utils/filecache.py +195 -0
  109. pixeltable/utils/help.py +11 -0
  110. pixeltable/utils/http_server.py +70 -0
  111. pixeltable/utils/media_store.py +76 -0
  112. pixeltable/utils/pytorch.py +91 -0
  113. pixeltable/utils/s3.py +13 -0
  114. pixeltable/utils/sql.py +17 -0
  115. pixeltable/utils/transactional_directory.py +35 -0
  116. pixeltable-0.0.0.dist-info/LICENSE +18 -0
  117. pixeltable-0.0.0.dist-info/METADATA +131 -0
  118. pixeltable-0.0.0.dist-info/RECORD +119 -0
  119. pixeltable-0.0.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,17 @@
1
def print_perf_counter_delta(delta: float) -> str:
    """Format a performance counter delta in a human-readable way.

    Despite its name, this function does not print anything; it returns the formatted string.
    The unit is chosen from the magnitude of the delta, so a negative delta is reported in the
    same unit as its positive counterpart.

    Args:
        delta: delta in seconds

    Returns:
        Human-readable string with two decimals and a unit suffix (ns, us, ms or s)
    """
    magnitude = abs(delta)
    # (exclusive upper bound in seconds, multiplier to the display unit, unit suffix)
    for upper_bound, scale, unit in ((1e-6, 1e9, 'ns'), (1e-3, 1e6, 'us'), (1, 1e3, 'ms')):
        if magnitude < upper_bound:
            return f'{delta * scale:.2f} {unit}'
    return f'{delta:.2f} s'
@@ -0,0 +1,98 @@
1
+ import logging
2
+ from typing import Any, Dict, Iterable, Iterator, Optional
3
+
4
+ import pyarrow as pa
5
+
6
+ import pixeltable.type_system as ts
7
+
8
# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)

# Lookup table from pyarrow scalar types to the pixeltable column type they are read as.
# Every pixeltable type in this table is created with nullable=True, and all listed integer
# widths (signed and unsigned) map to IntType.
_pa_to_pt: Dict[pa.DataType, ts.ColumnType] = {
    pa.string(): ts.StringType(nullable=True),
    pa.timestamp('us'): ts.TimestampType(nullable=True),
    pa.bool_(): ts.BoolType(nullable=True),
    pa.uint8(): ts.IntType(nullable=True),
    pa.int8(): ts.IntType(nullable=True),
    pa.uint32(): ts.IntType(nullable=True),
    pa.uint64(): ts.IntType(nullable=True),
    pa.int32(): ts.IntType(nullable=True),
    pa.int64(): ts.IntType(nullable=True),
    pa.float32(): ts.FloatType(nullable=True),
}
22
+
23
# Lookup table from pixeltable column type *classes* (not instances) to the pyarrow type used
# when writing them; to_arrow_type() looks entries up via `pixeltable_type.__class__`.
_pt_to_pa: Dict[type, pa.DataType] = {
    ts.StringType: pa.string(),
    ts.TimestampType: pa.timestamp('us'),  # postgres timestamp is microseconds
    ts.BoolType: pa.bool_(),
    ts.IntType: pa.int64(),
    ts.FloatType: pa.float32(),
    ts.JsonType: pa.string(),  # TODO(orm) pa.struct() is possible
    ts.ImageType: pa.binary(),  # inline image
    ts.AudioType: pa.string(),  # path
    ts.VideoType: pa.string(),  # path
    ts.DocumentType: pa.string(),  # path
}
35
+
36
+
37
def to_pixeltable_type(arrow_type: pa.DataType) -> Optional[ts.ColumnType]:
    """Map a pyarrow DataType onto the corresponding pixeltable ColumnType.

    Types listed in `_pa_to_pt` are resolved through that table. A fixed-shape tensor type becomes
    an ArrayType whose dtype is obtained by converting the tensor's value type recursively.
    Returns None if no conversion is currently implemented.
    """
    if arrow_type in _pa_to_pt:
        return _pa_to_pt[arrow_type]
    if not isinstance(arrow_type, pa.FixedShapeTensorType):
        return None
    element_type = to_pixeltable_type(arrow_type.value_type)
    return None if element_type is None else ts.ArrayType(shape=arrow_type.shape, dtype=element_type)
50
+
51
+
52
def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
    """Map a pixeltable ColumnType onto the corresponding pyarrow DataType.

    The class of `pixeltable_type` is looked up in `_pt_to_pa`; array types are turned into a
    fixed-shape tensor type built from the array's numpy dtype and shape.
    Returns None if no conversion is currently implemented.
    """
    type_cls = pixeltable_type.__class__
    if type_cls in _pt_to_pa:
        return _pt_to_pa[type_cls]
    if isinstance(pixeltable_type, ts.ArrayType):
        element_type = pa.from_numpy_dtype(pixeltable_type.numpy_dtype())
        return pa.fixed_shape_tensor(element_type, pixeltable_type.shape)
    return None
62
+
63
+
64
def to_pixeltable_schema(arrow_schema: pa.Schema) -> Dict[str, ts.ColumnType]:
    """Convert every field of `arrow_schema` with to_pixeltable_type(), keyed by field name.

    A field whose type has no conversion is mapped to None.
    """
    schema = {}
    for field in arrow_schema:
        schema[field.name] = to_pixeltable_type(field.type)
    return schema
66
+
67
+
68
def to_arrow_schema(pixeltable_schema: Dict[str, Any]) -> pa.Schema:
    """Build a pyarrow schema from a mapping of column name to pixeltable type.

    Each type is converted with to_arrow_type(); the field order follows the mapping's iteration order.
    """
    fields = [(col_name, to_arrow_type(col_type)) for col_name, col_type in pixeltable_schema.items()]
    return pa.schema(fields)
70
+
71
+
72
def to_pydict(batch: pa.RecordBatch) -> Dict[str, Iterable[Any]]:
    """Convert a RecordBatch into a dictionary of per-column value sequences.

    Unlike pa.lib.RecordBatch.to_pydict, fixed-shape tensor columns are not converted to lists:
    they are returned via to_numpy() so that the original numpy dtype is preserved.
    """

    def _column_values(col: Any) -> Iterable[Any]:
        if isinstance(col.type, pa.FixedShapeTensorType):
            # array columns stay numpy arrays, which keeps their numpy type
            return col.to_numpy(zero_copy_only=False)
        # everything else is turned into plain python values
        return col.to_pylist()

    return {name: _column_values(batch.column(idx)) for idx, name in enumerate(batch.schema.names)}
87
+
88
+
89
def iter_tuples(batch: pa.RecordBatch) -> Iterator[Dict[str, Any]]:
    """Yield one {column name: value} dictionary per row of `batch`.

    The original author notes that this also works with pa.Table and pa.RowGroup.
    The batch must have at least one column; the row count is taken from the first column.
    """
    columns = to_pydict(batch)
    assert len(columns) > 0, 'empty record batch'
    num_rows = len(next(iter(columns.values())))

    for row_idx in range(num_rows):
        yield {col_name: values[row_idx] for col_name, values in columns.items()}
@@ -0,0 +1,18 @@
1
+ import numpy as np
2
+ import PIL.Image
3
+
4
+ import pixeltable.func as func
5
+ from pixeltable.env import Env
6
+
7
+
8
def embed_image(img: PIL.Image.Image) -> np.ndarray:
    """Embed a single image with the openai_clip image-embedding model, run through the NOS client.

    The image is resized to 224x224 before it is sent; the 'embedding' entry of the result is
    returned after squeeze(0).
    """
    # NOTE(review): no pixeltable/functions/nos/ module appears in this release's file list —
    # confirm that this import still resolves.
    from pixeltable.functions.nos.image_embedding import openai_clip
    spec = openai_clip.model_spec
    nos_client = Env.get().nos_client
    response = nos_client.Run(task=spec.task, model_name=spec.name, images=[img.resize((224, 224))])
    embedding = response['embedding']
    return embedding.squeeze(0)
13
+
14
def embed_text(text: str) -> np.ndarray:
    """Embed a single string with the openai_clip text-embedding model, run through the NOS client.

    The 'embedding' entry of the result is returned after squeeze(0).
    """
    # NOTE(review): no pixeltable/functions/nos/ module appears in this release's file list —
    # confirm that this import still resolves.
    from pixeltable.functions.nos.text_embedding import openai_clip
    spec = openai_clip.model_spec
    nos_client = Env.get().nos_client
    response = nos_client.Run(task=spec.task, model_name=spec.name, texts=[text])
    embedding = response['embedding']
    return embedding.squeeze(0)
@@ -0,0 +1,136 @@
1
+ from typing import List, Dict, Any, Set
2
+ from pathlib import Path
3
+ import json
4
+
5
+ import PIL
6
+
7
+ import pixeltable.exceptions as excs
8
+
9
+
10
# Description of the input dict expected by write_coco_dataset(); _verify_input_dict() appends
# this text to every validation error it raises.
format_msg = """

Required format:
{
'image': PIL.Image.Image,
'annotations': [
{
'bbox': [x: int, y: int, w: int, h: int],
'category': str | int,
},
...
],
}
"""
24
+
25
def _verify_input_dict(input_dict: Dict[str, Any]) -> None:
    """Check that input_dict has the shape write_coco_dataset() expects.

    Raises excs.Error on the first violation found; the message names the problem, shows the
    offending dict/annotation and ends with the required format (format_msg).
    """

    def _reject(problem: str, offender: Any) -> None:
        # every error message is "<problem><offending value><format description>"
        raise excs.Error(f'{problem}{offender}{format_msg}')

    # top-level structure
    if not isinstance(input_dict, dict):
        _reject('Expected dict, got ', input_dict)
    if 'image' not in input_dict:
        _reject('Missing key "image" in input dict: ', input_dict)
    if not isinstance(input_dict['image'], PIL.Image.Image):
        _reject('Value for "image" is not a PIL.Image.Image: ', input_dict)
    if 'annotations' not in input_dict:
        _reject('Missing key "annotations" in input dict: ', input_dict)
    if not isinstance(input_dict['annotations'], list):
        _reject('Value for "annotations" is not a list: ', input_dict)

    # individual annotations
    for annotation in input_dict['annotations']:
        if not isinstance(annotation, dict):
            _reject('Annotation is not a dict: ', annotation)
        if 'bbox' not in annotation:
            _reject('Missing key "bbox" in annotation: ', annotation)
        bbox = annotation['bbox']
        if not isinstance(bbox, list):
            _reject('Value for "bbox" is not a list [x, y, w, h]: ', annotation)
        if not (len(bbox) == 4 and all(isinstance(coord, int) for coord in bbox)):
            _reject('Key "bbox" is not a list [x, y, w, h] of ints: ', annotation)
        if 'category' not in annotation:
            _reject('Missing key "category" in annotation: ', annotation)
        if not isinstance(annotation['category'], (str, int)):
            _reject('Value for "category" is not a str or int: ', annotation)
50
+
51
def write_coco_dataset(df: 'pixeltable.DataFrame', dest_path: Path) -> Path:
    """Export a DataFrame result set as a COCO dataset in dest_path and return the path of the data.json file.

    The DataFrame must select exactly one json-typed expression whose value, for every row, is a
    dict in the format checked by _verify_input_dict(). dest_path must not exist yet; it is created
    together with an 'images' subdirectory. Images that have no local file are saved there as
    '<image id>.jpg'.

    Category ids are assigned by sorting the distinct categories. Integer categories sort before
    string categories, so a mix of both (each of which passes validation) no longer fails.

    Raises:
        excs.Error: if the select list is not a single json-typed expression, or a row fails validation.
    """
    # TODO: validate schema
    if len(df._select_list_exprs) != 1 or not df._select_list_exprs[0].col_type.is_json_type():
        raise excs.Error(f'Expected exactly one json-typed column in select list: {df._select_list_exprs}')

    # create output dir
    assert not dest_path.exists()
    dest_path.mkdir(parents=False)
    images_dir = dest_path / 'images'
    images_dir.mkdir()

    images: List[Dict[str, Any]] = []
    annotations: List[Dict[str, Any]] = []
    categories: Set[Any] = set()

    input_dict_slot_idx = -1  # df._select_list_exprs[0].slot_idx isn't valid until _exec()
    img_slot_idx = None
    for img_id, input_row in enumerate(df._exec()):
        if input_dict_slot_idx == -1:
            input_dict_expr = df._select_list_exprs[0]
            input_dict_slot_idx = input_dict_expr.slot_idx
        input_dict = input_row[input_dict_slot_idx]
        _verify_input_dict(input_dict)

        if img_slot_idx is None:
            # we want to know the slot idx of the image used in the input dict, so that we can check whether we
            # already have a local path for it
            img_slot_idx = next(
                (e.slot_idx for e in input_dict_expr.dependencies() if e.col_type.is_image_type()), None)
            assert img_slot_idx is not None

        # get a local path for the image
        img = input_dict['image']
        if input_row.file_paths[img_slot_idx] is not None:
            # we already have a local path
            img_path = Path(input_row.file_paths[img_slot_idx])
            # TODO: if the path leads to our tmp dir, we need to move the file
        else:
            # we need to create a local path
            img_path = images_dir / f'{img_id}.jpg'
            img.save(img_path)

        # create image record
        images.append({
            'id': img_id,
            'file_name': str(img_path),
            'width': img.width,
            'height': img.height,
        })

        # create annotation records for this image
        for annotation in input_dict['annotations']:
            _, _, w, h = annotation['bbox']
            category = annotation['category']
            categories.add(category)
            annotations.append({
                'id': len(annotations),  # annotation ids are consecutive across all images
                'image_id': img_id,
                # we use the category name here and fix it up at the end, when we have assigned category ids
                'category_id': category,
                'bbox': annotation['bbox'],
                'area': w * h,
                'iscrowd': 0,
            })

    # replace category names with ids; the sort key keeps the previous order for all-int or all-str
    # categories and makes a mix of both sortable (ints first)
    ordered_categories = sorted(categories, key=lambda c: (isinstance(c, str), c))
    category_ids = {category: category_id for category_id, category in enumerate(ordered_categories)}
    for annotation in annotations:
        annotation['category_id'] = category_ids[annotation['category_id']]

    result = {
        'images': images,
        'annotations': annotations,
        'categories': [{'id': category_id, 'name': category} for category, category_id in category_ids.items()],
    }
    output_path = dest_path / 'data.json'
    with open(output_path, 'w') as f:
        json.dump(result, f)
    return output_path
136
+
@@ -0,0 +1,69 @@
1
+ from typing import Optional, Dict
2
+ import dataclasses
3
+
4
+ import pixeltable.type_system as ts
5
+
6
+
7
@dataclasses.dataclass
class DocumentHandle:
    # detected document format
    format: ts.DocumentType.DocumentFormat
    # parsed handles; get_document_handle() fills in the one matching `format`
    bs_doc: Optional['bs4.BeautifulSoup'] = dataclasses.field(default=None)  # HTML
    md_ast: Optional[Dict] = dataclasses.field(default=None)  # markdown
    pdf_doc: Optional['fitz.Document'] = dataclasses.field(default=None)  # PDF
13
+
14
def get_document_handle(path: str) -> Optional[DocumentHandle]:
    """Open the document at `path` and return a handle for the first format that accepts it.

    Formats are tried in the order PDF, HTML, markdown. Returns None if the file is neither a PDF
    nor valid utf8 text, or if none of the text-based parsers produces a handle.
    """
    # PDF goes first: a correct PDF is a binary format that would trigger encoding exceptions
    # if opened as utf8.
    pdf = get_pdf_handle(path)
    if pdf is not None:
        return DocumentHandle(format=ts.DocumentType.DocumentFormat.PDF, pdf_doc=pdf)

    # the remaining formats are text-based, so the file is read once in utf8 mode
    try:
        with open(path, 'r', encoding='utf8') as fh:
            text = fh.read()
    except UnicodeDecodeError:
        # not pdf, and also not valid text file
        return None

    # bs4 will appear to succeed for md files as well.
    # this will break most markdown files at the moment.
    html = get_html_handle(text)
    if html is not None:
        return DocumentHandle(format=ts.DocumentType.DocumentFormat.HTML, bs_doc=html)

    markdown = get_markdown_handle(text)
    if markdown is None:
        return None
    return DocumentHandle(format=ts.DocumentType.DocumentFormat.MD, md_ast=markdown)
39
+
40
def get_html_handle(text: str) -> Optional['bs4.BeautifulSoup']:
    """Parse `text` with bs4's 'html.parser'.

    Returns the parsed document, or None if parsing raises or if find() locates no element in it.
    """
    import bs4
    try:
        soup = bs4.BeautifulSoup(text, 'html.parser')
        has_element = soup.find() is not None
    except Exception:
        return None
    return soup if has_element else None
49
+
50
def get_markdown_handle(text: str) -> Optional[Dict]:
    """Parse `text` with a mistune markdown parser created with renderer=None.

    Returns whatever that parser returns for `text`, or None if creating or running it raises.
    """
    import mistune
    try:
        parsed = mistune.create_markdown(renderer=None)(text)
    except Exception:
        return None
    return parsed
57
+
58
def get_pdf_handle(path : str) -> Optional['fitz.Document']:
    """Open `path` with fitz (pymupdf) and return the document if it is a readable PDF.

    Returns None if the file cannot be opened, is not a PDF, or if its first page cannot be read
    (which includes documents without pages). A document that was opened but then rejected is
    closed before returning, so that its file handle is not leaked.
    """
    import fitz  # aka pymupdf
    try:
        doc = fitz.open(path)
    except Exception:
        return None

    usable = False
    try:
        # check pdf (bc fitz.open() will also work for images)
        if doc.is_pdf:
            # try to read one page
            next(page for page in doc)
            usable = True
    except Exception:
        usable = False
    if usable:
        return doc

    try:
        doc.close()
    except Exception:
        # best effort: the document is being discarded anyway
        pass
    return None
@@ -0,0 +1,195 @@
1
+ from __future__ import annotations
2
+ from typing import Optional, List, Tuple, Dict
3
+ from collections import OrderedDict, defaultdict, namedtuple
4
+ import os
5
+ import glob
6
+ from pathlib import Path
7
+ from time import time
8
+ import logging
9
+ from uuid import UUID
10
+ import hashlib
11
+
12
+ from pixeltable.env import Env
13
+
14
+
15
# Module-level logger; uses the 'pixeltable' logger name rather than this module's name.
_logger = logging.getLogger('pixeltable')
16
+
17
class CacheEntry:
    """Metadata for a single file in the file cache.

    The file belonging to an entry is named '<tbl_id hex>_<col_id>_<key><ext>' and lives in
    Env.get().file_cache_dir; from_file() reverses that naming scheme.
    """

    def __init__(self, key: str, tbl_id: UUID, col_id: int, size: int, last_accessed_ts: float, ext: str):
        self.key = key
        self.tbl_id = tbl_id
        self.col_id = col_id
        self.size = size  # file size in bytes (st_size)
        self.last_accessed_ts = last_accessed_ts  # the file's mtime (st_mtime)
        self.ext = ext  # file suffix, including the leading '.'

    def path(self) -> Path:
        """Return the location of this entry's file inside the file cache directory."""
        return Env.get().file_cache_dir / f'{self.tbl_id.hex}_{self.col_id}_{self.key}{self.ext}'

    @classmethod
    def from_file(cls, path: Path) -> CacheEntry:
        """Reconstruct the entry for an existing cache file from its name and os.stat() info.

        Raises:
            ValueError: if the file name does not have the form '<tbl_id hex>_<col_id>_<key><ext>'.
        """
        components = path.stem.split('_')
        if len(components) != 3:
            # explicit check instead of an assert, which would be stripped under -O
            raise ValueError(f'not a file cache entry: {path}')
        tbl_id_hex, col_id_str, key = components
        tbl_id = UUID(tbl_id_hex)
        col_id = int(col_id_str)
        file_info = os.stat(str(path))
        return cls(key, tbl_id, col_id, file_info.st_size, file_info.st_mtime, path.suffix)
38
+
39
+
40
class FileCache:
    """
    A local cache of external (eg, S3) file references in cells of a stored table (ie, table or view).

    Cache entries are identified by a hash of the file url and stored in Env.get().file_cache_dir. The time of
    last access of a cache entry is its file's mtime.

    TODO:
    - enforce a maximum capacity with LRU eviction
    - implement MRU eviction for queries that exceed the capacity
    """
    _instance: Optional[FileCache] = None
    ColumnStats = namedtuple('FileCacheColumnStats', ['tbl_id', 'col_id', 'num_files', 'total_size'])
    CacheStats = namedtuple(
        'FileCacheStats', ['total_size', 'num_requests', 'num_hits', 'num_evictions', 'column_stats'])

    @classmethod
    def get(cls) -> FileCache:
        """Return the process-wide FileCache instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        """Populate the cache from the files that already exist in the file cache directory."""
        self.cache: OrderedDict[str, CacheEntry] = OrderedDict()  # ordered by entry.last_accessed_ts
        self.total_size = 0
        # TODO: read the capacity from Env once a maximum size is enforced
        self.num_requests = 0
        self.num_hits = 0
        self.num_evictions = 0

        entries = []
        for path_str in glob.glob(str(Env.get().file_cache_dir / '*')):
            try:
                entries.append(CacheEntry.from_file(Path(path_str)))
            except (AssertionError, ValueError):
                # a stray file that doesn't follow the cache naming scheme must not make the
                # whole cache unusable
                _logger.warning(f'ignoring unrecognized file in file cache directory: {path_str}')
        # we need to insert entries in order of last_accessed_ts
        entries.sort(key=lambda e: e.last_accessed_ts)
        for entry in entries:
            self.cache[entry.key] = entry
            self.total_size += entry.size

    def avg_file_size(self) -> int:
        """Return the average size of the cached files, truncated to an int; 0 if the cache is empty."""
        if len(self.cache) == 0:
            return 0
        return int(self.total_size / len(self.cache))

    def num_files(self, tbl_id: Optional[UUID] = None) -> int:
        """Return the number of cached files, optionally restricted to those of table `tbl_id`."""
        if tbl_id is None:
            return len(self.cache)
        return sum(1 for e in self.cache.values() if e.tbl_id == tbl_id)

    def clear(self, tbl_id: Optional[UUID] = None, capacity: Optional[int] = None) -> None:
        """
        For testing purposes: allow resetting capacity and stats.

        Resets the request/hit/eviction counters and removes all entries (or only those of table
        `tbl_id`) together with their files. `capacity` is accepted but currently unused, because
        no capacity is enforced yet.
        """
        self.num_requests, self.num_hits, self.num_evictions = 0, 0, 0
        entries = list(self.cache.values())  # list(): we delete from self.cache while iterating
        if tbl_id is not None:
            entries = [e for e in entries if e.tbl_id == tbl_id]
            _logger.debug(f'clearing {len(entries)} entries from file cache for table {tbl_id}')
        else:
            _logger.debug(f'clearing {len(entries)} entries from file cache')
        for entry in entries:
            del self.cache[entry.key]
            self.total_size -= entry.size
            os.remove(entry.path())
        # TODO: apply `capacity` (or reset it to the default) once a capacity is enforced

    def _url_hash(self, url: str) -> str:
        """Return the cache key for `url`: the hex sha256 digest of the encoded url."""
        h = hashlib.sha256()
        h.update(url.encode())
        return h.hexdigest()

    def lookup(self, url: str) -> Optional[Path]:
        """Return the local path of the cached file for `url`, or None on a cache miss.

        A hit touches the file, refreshes the entry's last-access time from the file's mtime and
        moves the entry to the most-recently-used end of the cache.
        """
        self.num_requests += 1
        key = self._url_hash(url)
        entry = self.cache.get(key, None)
        if entry is None:
            _logger.debug(f'file cache miss for {url}')
            return None
        # update mtime and cache
        path = entry.path()
        path.touch(exist_ok=True)
        file_info = os.stat(str(path))
        entry.last_accessed_ts = file_info.st_mtime
        self.cache.move_to_end(key, last=True)
        self.num_hits += 1
        _logger.debug(f'file cache hit for {url}')
        return path

    def add(self, tbl_id: UUID, col_id: int, url: str, path: Path) -> Path:
        """Adds url at 'path' to cache and returns its new path.
        'path' will not be accessible after this call. Retains the extension of 'path'.

        The url must not be in the cache yet.
        """
        file_info = os.stat(str(path))
        # TODO: once a capacity is enforced, evict LRU entries here until the new file fits, and
        # switch to MRU when the LRU entry was brought in by the current query

        key = self._url_hash(url)
        assert key not in self.cache
        entry = CacheEntry(key, tbl_id, col_id, file_info.st_size, file_info.st_mtime, path.suffix)
        self.cache[key] = entry
        self.total_size += entry.size
        new_path = entry.path()
        os.rename(str(path), str(new_path))
        _logger.debug(f'added entry for cell {url} to file cache')
        return new_path

    def stats(self) -> CacheStats:
        """Return overall counters plus per-column stats, the latter sorted by total size (largest first)."""
        # collect column stats
        # (tbl_id, col_id) -> [num_files, total_size]
        d: Dict[Tuple[UUID, int], List[int]] = defaultdict(lambda: [0, 0])
        for entry in self.cache.values():
            t = d[(entry.tbl_id, entry.col_id)]
            t[0] += 1
            t[1] += entry.size
        col_stats = [
            self.ColumnStats(tbl_id, col_id, num_files, size) for (tbl_id, col_id), (num_files, size) in d.items()
        ]
        col_stats.sort(key=lambda e: e[3], reverse=True)
        return self.CacheStats(self.total_size, self.num_requests, self.num_hits, self.num_evictions, col_stats)

    def debug_print(self) -> None:
        """Print one line per cache entry to stdout."""
        for entry in self.cache.values():
            print(f'CacheEntry: tbl_id={entry.tbl_id}, col_id={entry.col_id}, size={entry.size}')
@@ -0,0 +1,11 @@
1
+ from typing import Any
2
+
3
+ import pixeltable.func as func
4
+
5
+
6
def help(obj: Any) -> None:
    """Print help text for the given object.

    For pixeltable functions the text comes from the function's help_str(); every other object
    is handed to Python's interactive help system.
    """
    if isinstance(obj, func.Function):
        print(obj.help_str())
    else:
        # This function shadows the builtin help(), and `__builtins__` is a plain dict in imported
        # modules (so it has no `.help` attribute); go through pydoc instead. pydoc's helper
        # prints the text itself and returns None, so its result is not printed.
        import pydoc
        pydoc.help(obj)
@@ -0,0 +1,70 @@
1
+ import http
2
+ import http.server
3
+ import logging
4
+ import urllib
5
+ import posixpath
6
+ import pathlib
7
+ import os
8
+ import string
9
+
10
# Module-level logger used by the request handler and the server below.
_logger = logging.getLogger('pixeltable.http.server')
11
+
12
+
13
def get_file_uri(http_address: str, file_path: str) -> str:
    """Get the URI for a file path, with the given prefix.
    Used in the client to generate a URI

    Args:
        http_address: prefix of the URI, eg 'http://127.0.0.1:8000'
        file_path: absolute local path of the file

    Returns:
        http_address followed by the URL-quoted form of file_path
    """
    # a plain `import urllib` does not load the `request` submodule, so import it explicitly
    import urllib.request

    abs_path = pathlib.Path(file_path)
    assert abs_path.is_absolute()
    url = urllib.request.pathname2url(str(abs_path))
    return f'{http_address}{url}'
21
+
22
+
23
class AbsolutePathHandler(http.server.SimpleHTTPRequestHandler):
    """Serves all absolute paths, not just the current directory"""

    # NOTE(review): translate_path() maps the request path straight to a local path without
    # restricting it to any directory, so every file readable by this process can be requested.

    def translate_path(self, path: str) -> str:
        """
        Translate a /-separated PATH to the local filename syntax.
        overrides http.server.SimpleHTTPRequestHandler.translate_path

        This is only useful for file serving.

        Code initially taken from there:
        https://github.com/python/cpython/blob/f5406ef454662b98df107775d18ff71ae6849618/Lib/http/server.py#L834
        """
        # a plain `import urllib` does not load the `request` submodule, so import it explicitly
        import urllib.request

        _logger.info(f'translate path {path=}')
        # abandon query parameters, taken from http.server.SimpleHTTPRequestHandler
        path = path.split('?', 1)[0]
        path = path.split('#', 1)[0]

        path = pathlib.Path(urllib.request.url2pathname(path))
        return str(path)

    def log_message(self, format, *args) -> None:
        """override logging to stderr in http.server.BaseHTTPRequestHandler"""
        message = format % args
        # strip control characters using the table provided by the base handler class
        _logger.info(message.translate(self._control_char_table))
47
+
48
+
49
class LoggingHTTPServer(http.server.ThreadingHTTPServer):
    """Avoids polluting stdout and stderr"""

    def handle_error(self, request, client_address) -> None:
        """override socketserver.TCPServer.handle_error which prints directly to sys.stderr"""
        import traceback

        # implicit concatenation of two literals instead of a backslash continuation inside one
        # literal, so that the message does not pick up the indentation of this source file
        _logger.error(
            f'Exception occurred during processing of {request=} from {client_address=}'
            f'\nbacktrace:\n{traceback.format_exc()}\n----\n'
        )
60
+
61
+
62
def make_server(address: str, port: int) -> http.server.HTTPServer:
    """Create the pixeltable file server: a LoggingHTTPServer bound to (address, port) that
    handles requests with AbsolutePathHandler.
    """
    bind_address = (address, port)
    server = LoggingHTTPServer(bind_address, AbsolutePathHandler)
    return server
65
+
66
+
67
if __name__ == '__main__':
    # standalone mode: serve files on localhost until interrupted
    httpd = make_server('127.0.0.1', 8000)
    print(f'about to serve HTTP on {httpd.server_address}')
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the expected way to stop the server
        pass
    finally:
        # release the listening socket
        httpd.server_close()