pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. pixeltable/__init__.py +42 -8
  2. pixeltable/{dataframe.py → _query.py} +470 -206
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +5 -4
  5. pixeltable/catalog/catalog.py +1785 -432
  6. pixeltable/catalog/column.py +190 -113
  7. pixeltable/catalog/dir.py +2 -4
  8. pixeltable/catalog/globals.py +19 -46
  9. pixeltable/catalog/insertable_table.py +191 -98
  10. pixeltable/catalog/path.py +63 -23
  11. pixeltable/catalog/schema_object.py +11 -15
  12. pixeltable/catalog/table.py +843 -436
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +978 -657
  15. pixeltable/catalog/table_version_handle.py +72 -16
  16. pixeltable/catalog/table_version_path.py +112 -43
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +134 -90
  20. pixeltable/config.py +134 -22
  21. pixeltable/env.py +471 -157
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +4 -1
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +11 -7
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +106 -56
  37. pixeltable/exec/globals.py +35 -0
  38. pixeltable/exec/in_memory_data_node.py +19 -19
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +351 -84
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +36 -23
  46. pixeltable/exprs/column_ref.py +213 -89
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +164 -54
  50. pixeltable/exprs/expr.py +70 -44
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +100 -40
  54. pixeltable/exprs/globals.py +2 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +18 -32
  57. pixeltable/exprs/is_null.py +7 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +27 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +167 -67
  64. pixeltable/exprs/rowid_ref.py +25 -10
  65. pixeltable/exprs/similarity_expr.py +58 -40
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +17 -11
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +29 -27
  78. pixeltable/func/signature.py +46 -19
  79. pixeltable/func/tools.py +31 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +16 -0
  82. pixeltable/functions/anthropic.py +123 -77
  83. pixeltable/functions/audio.py +147 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +7 -4
  86. pixeltable/functions/deepseek.py +35 -43
  87. pixeltable/functions/document.py +81 -0
  88. pixeltable/functions/fal.py +76 -0
  89. pixeltable/functions/fireworks.py +11 -20
  90. pixeltable/functions/gemini.py +195 -39
  91. pixeltable/functions/globals.py +142 -14
  92. pixeltable/functions/groq.py +108 -0
  93. pixeltable/functions/huggingface.py +1056 -24
  94. pixeltable/functions/image.py +115 -57
  95. pixeltable/functions/json.py +1 -1
  96. pixeltable/functions/llama_cpp.py +28 -13
  97. pixeltable/functions/math.py +67 -5
  98. pixeltable/functions/mistralai.py +18 -55
  99. pixeltable/functions/net.py +70 -0
  100. pixeltable/functions/ollama.py +20 -13
  101. pixeltable/functions/openai.py +240 -226
  102. pixeltable/functions/openrouter.py +143 -0
  103. pixeltable/functions/replicate.py +4 -4
  104. pixeltable/functions/reve.py +250 -0
  105. pixeltable/functions/string.py +239 -69
  106. pixeltable/functions/timestamp.py +16 -16
  107. pixeltable/functions/together.py +24 -84
  108. pixeltable/functions/twelvelabs.py +188 -0
  109. pixeltable/functions/util.py +6 -1
  110. pixeltable/functions/uuid.py +30 -0
  111. pixeltable/functions/video.py +1515 -107
  112. pixeltable/functions/vision.py +8 -8
  113. pixeltable/functions/voyageai.py +289 -0
  114. pixeltable/functions/whisper.py +16 -8
  115. pixeltable/functions/whisperx.py +179 -0
  116. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  117. pixeltable/globals.py +362 -115
  118. pixeltable/index/base.py +17 -21
  119. pixeltable/index/btree.py +28 -22
  120. pixeltable/index/embedding_index.py +100 -118
  121. pixeltable/io/__init__.py +4 -2
  122. pixeltable/io/datarows.py +8 -7
  123. pixeltable/io/external_store.py +56 -105
  124. pixeltable/io/fiftyone.py +13 -13
  125. pixeltable/io/globals.py +31 -30
  126. pixeltable/io/hf_datasets.py +61 -16
  127. pixeltable/io/label_studio.py +74 -70
  128. pixeltable/io/lancedb.py +3 -0
  129. pixeltable/io/pandas.py +21 -12
  130. pixeltable/io/parquet.py +25 -105
  131. pixeltable/io/table_data_conduit.py +250 -123
  132. pixeltable/io/utils.py +4 -4
  133. pixeltable/iterators/__init__.py +2 -1
  134. pixeltable/iterators/audio.py +26 -25
  135. pixeltable/iterators/base.py +9 -3
  136. pixeltable/iterators/document.py +112 -78
  137. pixeltable/iterators/image.py +12 -15
  138. pixeltable/iterators/string.py +11 -4
  139. pixeltable/iterators/video.py +523 -120
  140. pixeltable/metadata/__init__.py +14 -3
  141. pixeltable/metadata/converters/convert_13.py +2 -2
  142. pixeltable/metadata/converters/convert_18.py +2 -2
  143. pixeltable/metadata/converters/convert_19.py +2 -2
  144. pixeltable/metadata/converters/convert_20.py +2 -2
  145. pixeltable/metadata/converters/convert_21.py +2 -2
  146. pixeltable/metadata/converters/convert_22.py +2 -2
  147. pixeltable/metadata/converters/convert_24.py +2 -2
  148. pixeltable/metadata/converters/convert_25.py +2 -2
  149. pixeltable/metadata/converters/convert_26.py +2 -2
  150. pixeltable/metadata/converters/convert_29.py +4 -4
  151. pixeltable/metadata/converters/convert_30.py +34 -21
  152. pixeltable/metadata/converters/convert_34.py +2 -2
  153. pixeltable/metadata/converters/convert_35.py +9 -0
  154. pixeltable/metadata/converters/convert_36.py +38 -0
  155. pixeltable/metadata/converters/convert_37.py +15 -0
  156. pixeltable/metadata/converters/convert_38.py +39 -0
  157. pixeltable/metadata/converters/convert_39.py +124 -0
  158. pixeltable/metadata/converters/convert_40.py +73 -0
  159. pixeltable/metadata/converters/convert_41.py +12 -0
  160. pixeltable/metadata/converters/convert_42.py +9 -0
  161. pixeltable/metadata/converters/convert_43.py +44 -0
  162. pixeltable/metadata/converters/util.py +20 -31
  163. pixeltable/metadata/notes.py +9 -0
  164. pixeltable/metadata/schema.py +140 -53
  165. pixeltable/metadata/utils.py +74 -0
  166. pixeltable/mypy/__init__.py +3 -0
  167. pixeltable/mypy/mypy_plugin.py +123 -0
  168. pixeltable/plan.py +382 -115
  169. pixeltable/share/__init__.py +1 -1
  170. pixeltable/share/packager.py +547 -83
  171. pixeltable/share/protocol/__init__.py +33 -0
  172. pixeltable/share/protocol/common.py +165 -0
  173. pixeltable/share/protocol/operation_types.py +33 -0
  174. pixeltable/share/protocol/replica.py +119 -0
  175. pixeltable/share/publish.py +257 -59
  176. pixeltable/store.py +311 -194
  177. pixeltable/type_system.py +373 -211
  178. pixeltable/utils/__init__.py +2 -3
  179. pixeltable/utils/arrow.py +131 -17
  180. pixeltable/utils/av.py +298 -0
  181. pixeltable/utils/azure_store.py +346 -0
  182. pixeltable/utils/coco.py +6 -6
  183. pixeltable/utils/code.py +3 -3
  184. pixeltable/utils/console_output.py +4 -1
  185. pixeltable/utils/coroutine.py +6 -23
  186. pixeltable/utils/dbms.py +32 -6
  187. pixeltable/utils/description_helper.py +4 -5
  188. pixeltable/utils/documents.py +7 -18
  189. pixeltable/utils/exception_handler.py +7 -30
  190. pixeltable/utils/filecache.py +6 -6
  191. pixeltable/utils/formatter.py +86 -48
  192. pixeltable/utils/gcs_store.py +295 -0
  193. pixeltable/utils/http.py +133 -0
  194. pixeltable/utils/http_server.py +2 -3
  195. pixeltable/utils/iceberg.py +1 -2
  196. pixeltable/utils/image.py +17 -0
  197. pixeltable/utils/lancedb.py +90 -0
  198. pixeltable/utils/local_store.py +322 -0
  199. pixeltable/utils/misc.py +5 -0
  200. pixeltable/utils/object_stores.py +573 -0
  201. pixeltable/utils/pydantic.py +60 -0
  202. pixeltable/utils/pytorch.py +5 -6
  203. pixeltable/utils/s3_store.py +527 -0
  204. pixeltable/utils/sql.py +26 -0
  205. pixeltable/utils/system.py +30 -0
  206. pixeltable-0.5.7.dist-info/METADATA +579 -0
  207. pixeltable-0.5.7.dist-info/RECORD +227 -0
  208. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  209. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  210. pixeltable/__version__.py +0 -3
  211. pixeltable/catalog/named_function.py +0 -40
  212. pixeltable/ext/__init__.py +0 -17
  213. pixeltable/ext/functions/__init__.py +0 -11
  214. pixeltable/ext/functions/whisperx.py +0 -77
  215. pixeltable/utils/media_store.py +0 -77
  216. pixeltable/utils/s3.py +0 -17
  217. pixeltable-0.3.14.dist-info/METADATA +0 -434
  218. pixeltable-0.3.14.dist-info/RECORD +0 -186
  219. pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
  220. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
@@ -1,35 +1,12 @@
1
1
  import logging
2
- import sys
3
- from typing import Any, Callable, Optional, TypeVar
2
+ from typing import Any, Callable, TypeVar
4
3
 
5
4
  R = TypeVar('R')
6
5
 
7
-
8
- def _is_in_exception() -> bool:
9
- """
10
- Check if code is currently executing within an exception context.
11
- """
12
- current_exception = sys.exc_info()[1]
13
- return current_exception is not None
14
-
15
-
16
- def run_cleanup_on_exception(cleanup_func: Callable[..., R], *args: Any, **kwargs: Any) -> Optional[R]:
17
- """
18
- Runs cleanup only when running in exception context.
19
-
20
- The function `run_cleanup_on_exception()` should be used to clean up resources when an operation fails.
21
- This is typically done using a try, except, and finally block, with the resource cleanup logic placed within
22
- the except block. However, this pattern may not handle KeyboardInterrupt exceptions.
23
- To ensure that resources are always cleaned up at least once when an exception or KeyboardInterrupt occurs,
24
- create an idempotent function for cleaning up resources and pass it to the `run_cleanup_on_exception()` function
25
- from the finally block.
26
- """
27
- if _is_in_exception():
28
- return run_cleanup(cleanup_func, *args, raise_error=False, **kwargs)
29
- return None
6
+ logger = logging.getLogger('pixeltable')
30
7
 
31
8
 
32
- def run_cleanup(cleanup_func: Callable[..., R], *args: Any, raise_error: bool = True, **kwargs: Any) -> Optional[R]:
9
+ def run_cleanup(cleanup_func: Callable[..., R], *args: Any, raise_error: bool = True, **kwargs: Any) -> R | None:
33
10
  """
34
11
  Runs a cleanup function. If interrupted, retry cleanup.
35
12
  The `run_cleanup()` function ensures that the `cleanup_func()` function executes at least once.
@@ -40,20 +17,20 @@ def run_cleanup(cleanup_func: Callable[..., R], *args: Any, raise_error: bool =
40
17
  raise_error: raise an exception if an error occurs during cleanup.
41
18
  """
42
19
  try:
43
- logging.debug(f'Running cleanup function: {cleanup_func.__name__!r}')
20
+ logger.debug(f'Running cleanup function: {cleanup_func.__name__!r}')
44
21
  return cleanup_func(*args, **kwargs)
45
22
  except KeyboardInterrupt as interrupt:
46
23
  # Save original exception and re-attempt cleanup
47
24
  original_exception = interrupt
48
- logging.debug(f'Cleanup {cleanup_func.__name__!r} interrupted, retrying')
25
+ logger.debug(f'Cleanup {cleanup_func.__name__!r} interrupted, retrying')
49
26
  try:
50
27
  return cleanup_func(*args, **kwargs)
51
28
  except Exception as e:
52
29
  # Suppress this exception
53
- logging.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e}')
30
+ logger.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e.__class__}: {e}')
54
31
  raise KeyboardInterrupt from original_exception
55
32
  except Exception as e:
56
- logging.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e}')
33
+ logger.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e.__class__}: {e}')
57
34
  if raise_error:
58
35
  raise e
59
36
  return None
@@ -9,7 +9,7 @@ from collections import OrderedDict, defaultdict
9
9
  from dataclasses import dataclass
10
10
  from datetime import datetime, timezone
11
11
  from pathlib import Path
12
- from typing import NamedTuple, Optional
12
+ from typing import NamedTuple
13
13
  from uuid import UUID
14
14
 
15
15
  import pixeltable.exceptions as excs
@@ -58,7 +58,7 @@ class FileCache:
58
58
  - implement MRU eviction for queries that exceed the capacity
59
59
  """
60
60
 
61
- __instance: Optional[FileCache] = None
61
+ __instance: FileCache | None = None
62
62
 
63
63
  cache: OrderedDict[str, CacheEntry]
64
64
  total_size: int
@@ -126,12 +126,12 @@ class FileCache:
126
126
  return 0
127
127
  return int(self.total_size / len(self.cache))
128
128
 
129
- def num_files(self, tbl_id: Optional[UUID] = None) -> int:
129
+ def num_files(self, tbl_id: UUID | None = None) -> int:
130
130
  if tbl_id is None:
131
131
  return len(self.cache)
132
132
  return sum(e.tbl_id == tbl_id for e in self.cache.values())
133
133
 
134
- def clear(self, tbl_id: Optional[UUID] = None) -> None:
134
+ def clear(self, tbl_id: UUID | None = None) -> None:
135
135
  """
136
136
  For testing purposes: allow resetting capacity and stats.
137
137
  """
@@ -174,7 +174,7 @@ class FileCache:
174
174
  h.update(url.encode())
175
175
  return h.hexdigest()
176
176
 
177
- def lookup(self, url: str) -> Optional[Path]:
177
+ def lookup(self, url: str) -> Path | None:
178
178
  self.num_requests += 1
179
179
  key = self._url_hash(url)
180
180
  entry = self.cache.get(key, None)
@@ -214,7 +214,7 @@ class FileCache:
214
214
  new_path = entry.path
215
215
  os.rename(str(path), str(new_path))
216
216
  new_path.touch(exist_ok=True)
217
- _logger.debug(f'added entry for cell {url} to file cache')
217
+ _logger.debug(f'FileCache: cached url {url} with file name {new_path}')
218
218
  return new_path
219
219
 
220
220
  def ensure_capacity(self, size: int) -> None:
@@ -4,11 +4,13 @@ import io
4
4
  import json
5
5
  import logging
6
6
  import mimetypes
7
- from typing import Any, Callable, Optional
7
+ import uuid
8
+ from typing import Any, Callable
8
9
 
9
10
  import av
10
11
  import numpy as np
11
12
  from PIL import Image
13
+ from pypdfium2 import PdfDocument # type: ignore[import-untyped]
12
14
 
13
15
  import pixeltable.type_system as ts
14
16
  from pixeltable.utils.http_server import get_file_uri
@@ -19,11 +21,11 @@ _logger = logging.getLogger('pixeltable')
19
21
  class Formatter:
20
22
  """
21
23
  A factory for constructing HTML formatters for Pixeltable data. The formatters are used to customize
22
- the rendering of `DataFrameResultSet`s in notebooks.
24
+ the rendering of `ResultSet`s in notebooks.
23
25
 
24
26
  Args:
25
- num_rows: Number of rows in the DataFrame being rendered.
26
- num_cols: Number of columns in the DataFrame being rendered.
27
+ num_rows: Number of rows in the `ResultSet` being rendered.
28
+ num_cols: Number of columns in the `ResultSet` being rendered.
27
29
  http_address: Root address of the Pixeltable HTTP server (used to construct URLs for media references).
28
30
  """
29
31
 
@@ -39,9 +41,13 @@ class Formatter:
39
41
  self.__num_cols = num_cols
40
42
  self.__http_address = http_address
41
43
 
42
- def get_pandas_formatter(self, col_type: ts.ColumnType) -> Optional[Callable]:
44
+ def get_pandas_formatter(self, col_type: ts.ColumnType) -> Callable | None:
43
45
  if col_type.is_string_type():
44
46
  return self.format_string
47
+ if col_type.is_uuid_type():
48
+ return self.format_uuid
49
+ if col_type.is_binary_type():
50
+ return self.format_binary
45
51
  if col_type.is_float_type():
46
52
  return self.format_float
47
53
  if col_type.is_json_type():
@@ -63,10 +69,24 @@ class Formatter:
63
69
  """
64
70
  Escapes special characters in `val`, and abbreviates `val` if its length exceeds `_STRING_MAX_LEN`.
65
71
  """
66
- return cls.__escape(cls.__abbreviate(val, cls.__STRING_MAX_LEN))
72
+ return cls.__escape(cls.abbreviate(val))
67
73
 
68
74
  @classmethod
69
- def __abbreviate(cls, val: str, max_len: int) -> str:
75
+ def format_uuid(cls, val: uuid.UUID | None) -> str:
76
+ """
77
+ Formats a UUID by converting it to a string and applying string formatting.
78
+ """
79
+ return '' if val is None else cls.format_string(str(val))
80
+
81
+ @classmethod
82
+ def format_binary(cls, val: bytes) -> str:
83
+ """
84
+ Formats binary data by converting it to an encoded string and applying string formatting.
85
+ """
86
+ return cls.format_string(str(val))
87
+
88
+ @classmethod
89
+ def abbreviate(cls, val: str, max_len: int = __STRING_MAX_LEN) -> str:
70
90
  if len(val) > max_len:
71
91
  edgeitems = (max_len - len(cls.__STRING_SEP)) // 2
72
92
  return f'{val[:edgeitems]}{cls.__STRING_SEP}{val[-edgeitems:]}'
@@ -94,41 +114,45 @@ class Formatter:
94
114
  )
95
115
 
96
116
  @classmethod
97
- def format_json(cls, val: Any) -> str:
117
+ def format_json(cls, val: Any, escape_strings: bool = True) -> str:
98
118
  if isinstance(val, str):
99
119
  # JSON-like formatting will be applied to strings that appear nested within a list or dict
100
120
  # (quote the string; escape any quotes inside the string; shorter abbreviations).
101
121
  # However, if the string appears in top-level position (i.e., the entire JSON value is a
102
122
  # string), then we format it like an ordinary string.
103
- return cls.format_string(val)
123
+ return cls.format_string(val) if escape_strings else cls.abbreviate(val)
104
124
  # In all other cases, dump the JSON struct recursively.
105
- return cls.__format_json_rec(val)
125
+ return cls.__format_json_rec(val, escape_strings)
106
126
 
107
127
  @classmethod
108
- def __format_json_rec(cls, val: Any) -> str:
128
+ def __format_json_rec(cls, val: Any, escape_strings: bool) -> str:
109
129
  if isinstance(val, str):
110
- return cls.__escape(json.dumps(cls.__abbreviate(val, cls.__NESTED_STRING_MAX_LEN)))
130
+ formatted = json.dumps(cls.abbreviate(val, cls.__NESTED_STRING_MAX_LEN))
131
+ return cls.__escape(formatted) if escape_strings else formatted
111
132
  if isinstance(val, float):
112
133
  return cls.format_float(val)
113
134
  if isinstance(val, np.ndarray):
114
135
  return cls.format_array(val)
115
136
  if isinstance(val, list):
116
137
  if len(val) < cls.__LIST_THRESHOLD:
117
- components = [cls.__format_json_rec(x) for x in val]
138
+ components = [cls.__format_json_rec(x, escape_strings) for x in val]
118
139
  else:
119
- components = [cls.__format_json_rec(x) for x in val[: cls.__LIST_EDGEITEMS]]
140
+ components = [cls.__format_json_rec(x, escape_strings) for x in val[: cls.__LIST_EDGEITEMS]]
120
141
  components.append('...')
121
- components.extend(cls.__format_json_rec(x) for x in val[-cls.__LIST_EDGEITEMS :])
142
+ components.extend(cls.__format_json_rec(x, escape_strings) for x in val[-cls.__LIST_EDGEITEMS :])
122
143
  return '[' + ', '.join(components) + ']'
123
144
  if isinstance(val, dict):
124
- kv_pairs = (f'{cls.__format_json_rec(k)}: {cls.__format_json_rec(v)}' for k, v in val.items())
145
+ kv_pairs = (
146
+ f'{cls.__format_json_rec(k, escape_strings)}: {cls.__format_json_rec(v, escape_strings)}'
147
+ for k, v in val.items()
148
+ )
125
149
  return '{' + ', '.join(kv_pairs) + '}'
126
150
 
127
151
  # Everything else
128
152
  try:
129
153
  return json.dumps(val)
130
154
  except TypeError: # Not JSON serializable
131
- return str(val)
155
+ return cls.__escape(str(val))
132
156
 
133
157
  def format_img(self, img: Image.Image) -> str:
134
158
  """
@@ -152,22 +176,19 @@ class Formatter:
152
176
  """
153
177
 
154
178
  def format_video(self, file_path: str) -> str:
155
- thumb_tag = ''
156
179
  # Attempt to extract the first frame of the video to use as a thumbnail,
157
180
  # so that the notebook can be exported as HTML and viewed in contexts where
158
181
  # the video itself is not accessible.
159
182
  # TODO(aaron-siegel): If the video is backed by a concrete external URL,
160
183
  # should we link to that instead?
161
- with av.open(file_path) as container:
162
- try:
163
- thumb = next(container.decode(video=0)).to_image()
164
- assert isinstance(thumb, Image.Image)
165
- with io.BytesIO() as buffer:
166
- thumb.save(buffer, 'jpeg')
167
- thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
168
- thumb_tag = f'poster="data:image/jpeg;base64,{thumb_base64}"'
169
- except Exception:
170
- pass
184
+ thumb = self.extract_first_video_frame(file_path)
185
+ if thumb is None:
186
+ thumb_tag = ''
187
+ else:
188
+ with io.BytesIO() as buffer:
189
+ thumb.save(buffer, 'jpeg')
190
+ thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
191
+ thumb_tag = f'poster="data:image/jpeg;base64,{thumb_base64}"'
171
192
  if self.__num_rows > 1:
172
193
  width = 320
173
194
  elif self.__num_cols > 1:
@@ -182,6 +203,16 @@ class Formatter:
182
203
  </div>
183
204
  """
184
205
 
206
+ @classmethod
207
+ def extract_first_video_frame(cls, file_path: str) -> Image.Image | None:
208
+ with av.open(file_path) as container:
209
+ try:
210
+ img = next(container.decode(video=0)).to_image()
211
+ assert isinstance(img, Image.Image)
212
+ return img
213
+ except Exception:
214
+ return None
215
+
185
216
  def format_audio(self, file_path: str) -> str:
186
217
  return f"""
187
218
  <div class="pxt_audio">
@@ -191,29 +222,18 @@ class Formatter:
191
222
  </div>
192
223
  """
193
224
 
194
- def format_document(self, file_path: str) -> str:
195
- max_width = max_height = 320
225
+ def format_document(self, file_path: str, max_width: int = 320, max_height: int = 320) -> str:
196
226
  # by default, file path will be shown as a link
197
227
  inner_element = file_path
198
228
  inner_element = html.escape(inner_element)
199
- # try generating a thumbnail for different types and use that if successful
200
- if file_path.lower().endswith('.pdf'):
201
- try:
202
- import fitz # type: ignore[import-untyped]
203
-
204
- doc = fitz.open(file_path)
205
- p = doc.get_page_pixmap(0)
206
- while p.width > max_width or p.height > max_height:
207
- # shrink(1) will halve each dimension
208
- p.shrink(1)
209
- data = p.tobytes(output='jpeg')
210
- thumb_base64 = base64.b64encode(data).decode()
211
- img_src = f'data:image/jpeg;base64,{thumb_base64}'
212
- inner_element = f"""
213
- <img style="object-fit: contain; border: 1px solid black;" src="{img_src}" />
214
- """
215
- except Exception:
216
- logging.warning(f'Failed to produce PDF thumbnail {file_path}. Make sure you have PyMuPDF installed.')
229
+
230
+ thumb = self.make_document_thumbnail(file_path, max_width, max_height)
231
+ if thumb is not None:
232
+ with io.BytesIO() as buffer:
233
+ thumb.save(buffer, 'webp')
234
+ thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
235
+ thumb_tag = f'data:image/webp;base64,{thumb_base64}'
236
+ inner_element = f'<img style="object-fit: contain; border: 1px solid black;" src="{thumb_tag}" />'
217
237
 
218
238
  return f"""
219
239
  <div class="pxt_document" style="width:{max_width}px;">
@@ -223,6 +243,24 @@ class Formatter:
223
243
  </div>
224
244
  """
225
245
 
246
+ @classmethod
247
+ def make_document_thumbnail(cls, file_path: str, max_width: int = 320, max_height: int = 320) -> Image.Image | None:
248
+ """
249
+ Returns a thumbnail image of a document.
250
+ """
251
+ if file_path.lower().endswith('.pdf'):
252
+ try:
253
+ doc = PdfDocument(file_path)
254
+ if len(doc) == 0:
255
+ return None
256
+ img = doc[0].render().to_pil()
257
+ img.thumbnail((max_width, max_height), Image.LANCZOS)
258
+ return img
259
+ except Exception:
260
+ logging.warning(f'Failed to produce PDF thumbnail {file_path}. Make sure you have pypdfium2 installed.')
261
+
262
+ return None
263
+
226
264
  @classmethod
227
265
  def __create_source_tag(cls, http_address: str, file_path: str) -> str:
228
266
  src_url = get_file_uri(http_address, file_path)
@@ -0,0 +1,295 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import re
5
+ import urllib.parse
6
+ import uuid
7
+ from pathlib import Path
8
+ from typing import TYPE_CHECKING, Any, Iterator
9
+
10
+ from google.api_core.exceptions import GoogleAPIError
11
+ from google.cloud import storage # type: ignore[attr-defined]
12
+ from google.cloud.exceptions import Forbidden, NotFound
13
+ from google.cloud.storage.client import Client # type: ignore[import-untyped]
14
+
15
+ from pixeltable import env, exceptions as excs
16
+ from pixeltable.utils.object_stores import ObjectPath, ObjectStoreBase, StorageObjectAddress, StorageTarget
17
+
18
+ if TYPE_CHECKING:
19
+ from pixeltable.catalog import Column
20
+
21
+ _logger = logging.getLogger('pixeltable')
22
+
23
+
24
+ @env.register_client('gcs_store')
25
+ def _() -> 'Client':
26
+ """Create and return a GCS client, using default credentials if available,
27
+ otherwise creating an anonymous client for public buckets.
28
+ """
29
+ try:
30
+ # Create a client with default credentials
31
+ # Note that if the default credentials have expired, gcloud will still create a client,
32
+ # which will report the expiry error when it is used.
33
+ # To create and use an anonymous client, expired credentials must be removed.
34
+ # For application default credentials, delete the file in ~/.config/gcloud/, or
35
+ # gcloud auth application-default revoke
36
+ # OR
37
+ # For service account keys, you must delete the downloaded key file.
38
+ client = storage.Client()
39
+ return client
40
+ except Exception:
41
+ # If no credentials are found, create an anonymous client which can be used for public buckets.
42
+ client = storage.Client.create_anonymous_client()
43
+ return client
44
+
45
+
46
+ class GCSStore(ObjectStoreBase):
47
+ """Class to handle Google Cloud Storage operations."""
48
+
49
+ # URI of the GCS bucket in the format gs://bucket_name/prefix/
50
+ # Always ends with a slash
51
+ __base_uri: str
52
+
53
+ # bucket name extracted from the URI
54
+ __bucket_name: str
55
+
56
+ # prefix path within the bucket, either empty or ending with a slash
57
+ __prefix_name: str
58
+
59
+ # The parsed form of the given destination address
60
+ soa: StorageObjectAddress
61
+
62
+ def __init__(self, soa: StorageObjectAddress):
63
+ assert soa.storage_target == StorageTarget.GCS_STORE, f'Expected storage_target "gs", got {soa.storage_target}'
64
+ self.soa = soa
65
+ self.__base_uri = soa.prefix_free_uri + soa.prefix
66
+ self.__bucket_name = soa.container
67
+ self.__prefix_name = soa.prefix
68
+
69
+ @classmethod
70
+ def client(cls) -> 'Client':
71
+ """Return the GCS client."""
72
+ return env.Env.get().get_client('gcs_store')
73
+
74
+ @property
75
+ def bucket_name(self) -> str:
76
+ """Return the bucket name from the base URI."""
77
+ return self.__bucket_name
78
+
79
+ @property
80
+ def prefix(self) -> str:
81
+ """Return the prefix from the base URI."""
82
+ return self.__prefix_name
83
+
84
+ def validate(self, error_col_name: str) -> str | None:
85
+ """
86
+ Checks if the URI exists.
87
+
88
+ Returns:
89
+ str: The base URI if the GCS bucket exists and is accessible, None otherwise.
90
+ """
91
+ try:
92
+ client = self.client()
93
+ bucket = client.bucket(self.bucket_name)
94
+ blobs = bucket.list_blobs(max_results=1)
95
+ # This will raise an exception if the destination doesn't exist or cannot be listed
96
+ _ = list(blobs) # Force evaluation to check access
97
+ return self.__base_uri
98
+ except (NotFound, Forbidden, GoogleAPIError) as e:
99
+ self.handle_gcs_error(e, self.bucket_name, f'validate bucket {error_col_name}')
100
+ return None
101
+
102
+ def _prepare_uri_raw(self, tbl_id: uuid.UUID, col_id: int, tbl_version: int, ext: str | None = None) -> str:
103
+ """
104
+ Construct a new, unique URI for a persisted media file.
105
+ """
106
+ prefix, filename = ObjectPath.create_prefix_raw(tbl_id, col_id, tbl_version, ext)
107
+ parent = f'{self.__base_uri}{prefix}'
108
+ return f'{parent}/{filename}'
109
+
110
+ def _prepare_uri(self, col: Column, ext: str | None = None) -> str:
111
+ """
112
+ Construct a new, unique URI for a persisted media file.
113
+ """
114
+ assert col.get_tbl() is not None, 'Column must be associated with a table'
115
+ return self._prepare_uri_raw(col.get_tbl().id, col.id, col.get_tbl().version, ext=ext)
116
+
117
+ def copy_local_file(self, col: Column, src_path: Path) -> str:
118
+ """Copy a local file, and return its new URL"""
119
+ new_file_uri = self._prepare_uri(col, ext=src_path.suffix)
120
+ parsed = urllib.parse.urlparse(new_file_uri)
121
+ blob_name = parsed.path.lstrip('/')
122
+
123
+ try:
124
+ client = self.client()
125
+ bucket = client.bucket(self.bucket_name)
126
+ blob = bucket.blob(blob_name)
127
+ blob.upload_from_filename(str(src_path))
128
+ _logger.debug(f'Media Storage: copied {src_path} to {new_file_uri}')
129
+ return new_file_uri
130
+ except GoogleAPIError as e:
131
+ self.handle_gcs_error(e, self.bucket_name, f'upload file {src_path}')
132
+ raise
133
+
134
+ def copy_object_to_local_file(self, src_path: str, dest_path: Path) -> None:
135
+ """Copies an object to a local file. Thread safe"""
136
+ try:
137
+ client = self.client()
138
+ bucket = client.bucket(self.bucket_name)
139
+ blob = bucket.blob(self.prefix + src_path)
140
+ blob.download_to_filename(str(dest_path))
141
+ except GoogleAPIError as e:
142
+ self.handle_gcs_error(e, self.bucket_name, f'download file {src_path}')
143
+ raise
144
+
145
def _get_filtered_objects(self, bucket: Any, tbl_id: uuid.UUID, tbl_version: int | None = None) -> Iterator:
    """List the bucket's objects for a table, optionally restricted to one table version.

    Args:
        bucket: Bucket object to list; only its `list_blobs(prefix=...)` method is used here
        tbl_id: Table UUID to filter by
        tbl_version: Optional table version to filter by

    Returns:
        Iterator over the objects matching the criteria (the bucket itself is not returned)
    """
    table_prefix = ObjectPath.table_prefix(tbl_id)
    listing_prefix = f'{self.prefix}{table_prefix}/'

    if tbl_version is None:
        # No version restriction: everything under the table prefix qualifies.
        return bucket.list_blobs(prefix=listing_prefix)

    # Restrict to one version by matching the final path component against the
    # ObjectPath naming pattern: tbl_id_col_id_version_uuid
    name_re = re.compile(rf'{re.escape(table_prefix)}_\d+_{re.escape(str(tbl_version))}_[0-9a-fA-F]+.*')
    candidates = bucket.list_blobs(prefix=listing_prefix)
    # Generator expression: blobs are filtered as the caller consumes them.
    return (blob for blob in candidates if name_re.match(blob.name.split('/')[-1]))
170
+
171
def count(self, tbl_id: uuid.UUID, tbl_version: int | None = None) -> int:
    """Return how many stored files belong to tbl_id.

    When tbl_version is given, only files of that table version are counted.

    Args:
        tbl_id: Table UUID to count objects for
        tbl_version: Optional table version to filter by

    Returns:
        Number of objects matching the criteria

    Raises:
        The error raised by `handle_gcs_error()` for a GoogleAPIError; if that call returns,
        the original GoogleAPIError is re-raised.
    """
    assert tbl_id is not None

    try:
        gcs_bucket = self.client().bucket(self.bucket_name)
        matching = self._get_filtered_objects(gcs_bucket, tbl_id, tbl_version)

        # Consume the iterator inside the try block so listing errors are translated as well.
        n_objects = 0
        for _ in matching:
            n_objects += 1
        return n_objects

    except GoogleAPIError as e:
        self.handle_gcs_error(e, self.bucket_name, f'setup iterator {self.prefix}')
        raise
195
+
196
def delete(self, tbl_id: uuid.UUID, tbl_version: int | None = None) -> int:
    """Remove the stored files that belong to tbl_id.

    When tbl_version is given, only files of that table version are removed.

    Args:
        tbl_id: Table UUID to delete objects for
        tbl_version: Optional table version to filter by

    Returns:
        Number of objects deleted

    Raises:
        The error raised by `handle_gcs_error()` for a GoogleAPIError; if that call returns,
        the original GoogleAPIError is re-raised.
    """
    assert tbl_id is not None
    total_deleted = 0

    try:
        gcs_client = self.client()
        gcs_bucket = gcs_client.bucket(self.bucket_name)

        def _delete_batch(blobs: list) -> int:
            # Issue all deletes inside one client.batch() context and report how many were sent.
            with gcs_client.batch():
                for b in blobs:
                    b.delete()
            return len(blobs)

        pending = []
        for blob in self._get_filtered_objects(gcs_bucket, tbl_id, tbl_version):
            pending.append(blob)
            # Flush as soon as 100 blobs have accumulated.
            if len(pending) >= 100:
                total_deleted += _delete_batch(pending)
                pending = []

        # Whatever is left forms the final, smaller batch.
        if pending:
            total_deleted += _delete_batch(pending)

        return total_deleted

    except GoogleAPIError as e:
        self.handle_gcs_error(e, self.bucket_name, f'deleting with {self.prefix}')
        raise
241
+
242
def list_objects(self, return_uri: bool, n_max: int = 10) -> list[str]:
    """List objects stored under this store's prefix in the destination bucket.

    Each entry carries the object's full name, including all prefixes.

    Args:
        return_uri: If True, each name is prepended with `self.soa.prefix_free_uri`;
            otherwise the bare object keys are returned
        n_max: Passed to the listing call as `max_results`; collection also stops once
            this many names have been gathered

    Returns:
        The collected object names or URIs
    """
    uri_prefix = ''
    if return_uri:
        uri_prefix = self.soa.prefix_free_uri
    gcs_client = self.client()
    names: list[str] = []

    try:
        gcs_bucket = gcs_client.bucket(self.bucket_name)
        # Ask for at most n_max blobs below the store's prefix.
        listing = gcs_bucket.list_blobs(prefix=self.prefix, max_results=n_max)

        for blob in listing:
            names.append(f'{uri_prefix}{blob.name}')
            if len(names) >= n_max:
                break

    except GoogleAPIError as e:
        self.handle_gcs_error(e, self.bucket_name, f'list objects from {self.prefix}')
    return names
264
+
265
def create_presigned_url(self, soa: StorageObjectAddress, expiration_seconds: int) -> str:
    """Build a signed GET URL for downloading an object from GCS.

    Args:
        soa: Address of the object; `soa.container` is used as the bucket name and
            `soa.key` as the object name
        expiration_seconds: Forwarded unchanged as the `expiration` argument of the
            V4 signing call (presumably the URL lifetime in seconds; confirm against
            the google-cloud-storage documentation)

    Returns:
        The signed URL

    Raises:
        excs.Error: if `soa` does not reference an object
    """
    if not soa.has_object:
        raise excs.Error(f'StorageObjectAddress does not contain an object name: {soa}')

    target = self.client().bucket(soa.container).blob(soa.key)
    return target.generate_signed_url(version='v4', expiration=expiration_seconds, method='GET')
276
+
277
@classmethod
def handle_gcs_error(cls, e: Exception, bucket_name: str, operation: str = '', *, ignore_404: bool = False) -> None:
    """Translate a GCS error into an excs.Error with a descriptive message.

    Args:
        e: The exception to translate
        bucket_name: Bucket name included in the error message
        operation: Short description of the failed operation, included in the error message
        ignore_404: If True, a NotFound error is silently ignored and the method returns

    Raises:
        excs.Error: in every case except a NotFound error with ignore_404 set
    """
    # NotFound and Forbidden are checked before the generic GoogleAPIError case.
    if isinstance(e, NotFound):
        if ignore_404:
            return
        raise excs.Error(f'Bucket or object {bucket_name} not found during {operation}: {str(e)!r}')

    if isinstance(e, Forbidden):
        raise excs.Error(f'Access denied to bucket {bucket_name} during {operation}: {str(e)!r}')

    if not isinstance(e, GoogleAPIError):
        # Generic error handling
        raise excs.Error(f'Unexpected error during {operation} in bucket {bucket_name}: {str(e)!r}')

    # Remaining Google API errors: precondition failures get their own message.
    error_message = str(e)
    if 'Precondition' in error_message:
        raise excs.Error(f'Precondition failed for bucket {bucket_name} during {operation}: {error_message}')
    raise excs.Error(f'Error during {operation} in bucket {bucket_name}: {error_message}')