pixeltable 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (119):
  1. pixeltable/__init__.py +53 -0
  2. pixeltable/__version__.py +3 -0
  3. pixeltable/catalog/__init__.py +13 -0
  4. pixeltable/catalog/catalog.py +159 -0
  5. pixeltable/catalog/column.py +181 -0
  6. pixeltable/catalog/dir.py +32 -0
  7. pixeltable/catalog/globals.py +33 -0
  8. pixeltable/catalog/insertable_table.py +192 -0
  9. pixeltable/catalog/named_function.py +36 -0
  10. pixeltable/catalog/path.py +58 -0
  11. pixeltable/catalog/path_dict.py +139 -0
  12. pixeltable/catalog/schema_object.py +39 -0
  13. pixeltable/catalog/table.py +695 -0
  14. pixeltable/catalog/table_version.py +1026 -0
  15. pixeltable/catalog/table_version_path.py +133 -0
  16. pixeltable/catalog/view.py +203 -0
  17. pixeltable/dataframe.py +749 -0
  18. pixeltable/env.py +466 -0
  19. pixeltable/exceptions.py +17 -0
  20. pixeltable/exec/__init__.py +10 -0
  21. pixeltable/exec/aggregation_node.py +78 -0
  22. pixeltable/exec/cache_prefetch_node.py +116 -0
  23. pixeltable/exec/component_iteration_node.py +79 -0
  24. pixeltable/exec/data_row_batch.py +94 -0
  25. pixeltable/exec/exec_context.py +22 -0
  26. pixeltable/exec/exec_node.py +61 -0
  27. pixeltable/exec/expr_eval_node.py +217 -0
  28. pixeltable/exec/in_memory_data_node.py +73 -0
  29. pixeltable/exec/media_validation_node.py +43 -0
  30. pixeltable/exec/sql_scan_node.py +226 -0
  31. pixeltable/exprs/__init__.py +25 -0
  32. pixeltable/exprs/arithmetic_expr.py +102 -0
  33. pixeltable/exprs/array_slice.py +71 -0
  34. pixeltable/exprs/column_property_ref.py +77 -0
  35. pixeltable/exprs/column_ref.py +114 -0
  36. pixeltable/exprs/comparison.py +77 -0
  37. pixeltable/exprs/compound_predicate.py +98 -0
  38. pixeltable/exprs/data_row.py +199 -0
  39. pixeltable/exprs/expr.py +594 -0
  40. pixeltable/exprs/expr_set.py +39 -0
  41. pixeltable/exprs/function_call.py +382 -0
  42. pixeltable/exprs/globals.py +69 -0
  43. pixeltable/exprs/image_member_access.py +96 -0
  44. pixeltable/exprs/in_predicate.py +96 -0
  45. pixeltable/exprs/inline_array.py +109 -0
  46. pixeltable/exprs/inline_dict.py +103 -0
  47. pixeltable/exprs/is_null.py +38 -0
  48. pixeltable/exprs/json_mapper.py +121 -0
  49. pixeltable/exprs/json_path.py +159 -0
  50. pixeltable/exprs/literal.py +66 -0
  51. pixeltable/exprs/object_ref.py +41 -0
  52. pixeltable/exprs/predicate.py +44 -0
  53. pixeltable/exprs/row_builder.py +329 -0
  54. pixeltable/exprs/rowid_ref.py +94 -0
  55. pixeltable/exprs/similarity_expr.py +65 -0
  56. pixeltable/exprs/type_cast.py +53 -0
  57. pixeltable/exprs/variable.py +45 -0
  58. pixeltable/ext/__init__.py +5 -0
  59. pixeltable/ext/functions/yolox.py +92 -0
  60. pixeltable/func/__init__.py +7 -0
  61. pixeltable/func/aggregate_function.py +197 -0
  62. pixeltable/func/callable_function.py +113 -0
  63. pixeltable/func/expr_template_function.py +99 -0
  64. pixeltable/func/function.py +141 -0
  65. pixeltable/func/function_registry.py +227 -0
  66. pixeltable/func/globals.py +46 -0
  67. pixeltable/func/nos_function.py +202 -0
  68. pixeltable/func/signature.py +162 -0
  69. pixeltable/func/udf.py +164 -0
  70. pixeltable/functions/__init__.py +95 -0
  71. pixeltable/functions/eval.py +215 -0
  72. pixeltable/functions/fireworks.py +34 -0
  73. pixeltable/functions/huggingface.py +167 -0
  74. pixeltable/functions/image.py +16 -0
  75. pixeltable/functions/openai.py +289 -0
  76. pixeltable/functions/pil/image.py +147 -0
  77. pixeltable/functions/string.py +13 -0
  78. pixeltable/functions/together.py +143 -0
  79. pixeltable/functions/util.py +52 -0
  80. pixeltable/functions/video.py +62 -0
  81. pixeltable/globals.py +425 -0
  82. pixeltable/index/__init__.py +2 -0
  83. pixeltable/index/base.py +51 -0
  84. pixeltable/index/embedding_index.py +168 -0
  85. pixeltable/io/__init__.py +3 -0
  86. pixeltable/io/hf_datasets.py +188 -0
  87. pixeltable/io/pandas.py +148 -0
  88. pixeltable/io/parquet.py +192 -0
  89. pixeltable/iterators/__init__.py +3 -0
  90. pixeltable/iterators/base.py +52 -0
  91. pixeltable/iterators/document.py +432 -0
  92. pixeltable/iterators/video.py +88 -0
  93. pixeltable/metadata/__init__.py +58 -0
  94. pixeltable/metadata/converters/convert_10.py +18 -0
  95. pixeltable/metadata/converters/convert_12.py +3 -0
  96. pixeltable/metadata/converters/convert_13.py +41 -0
  97. pixeltable/metadata/schema.py +234 -0
  98. pixeltable/plan.py +620 -0
  99. pixeltable/store.py +424 -0
  100. pixeltable/tool/create_test_db_dump.py +184 -0
  101. pixeltable/tool/create_test_video.py +81 -0
  102. pixeltable/type_system.py +846 -0
  103. pixeltable/utils/__init__.py +17 -0
  104. pixeltable/utils/arrow.py +98 -0
  105. pixeltable/utils/clip.py +18 -0
  106. pixeltable/utils/coco.py +136 -0
  107. pixeltable/utils/documents.py +69 -0
  108. pixeltable/utils/filecache.py +195 -0
  109. pixeltable/utils/help.py +11 -0
  110. pixeltable/utils/http_server.py +70 -0
  111. pixeltable/utils/media_store.py +76 -0
  112. pixeltable/utils/pytorch.py +91 -0
  113. pixeltable/utils/s3.py +13 -0
  114. pixeltable/utils/sql.py +17 -0
  115. pixeltable/utils/transactional_directory.py +35 -0
  116. pixeltable-0.0.0.dist-info/LICENSE +18 -0
  117. pixeltable-0.0.0.dist-info/METADATA +131 -0
  118. pixeltable-0.0.0.dist-info/RECORD +119 -0
  119. pixeltable-0.0.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,77 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Optional, List, Any, Dict, Tuple
4
+
5
+ import sqlalchemy as sql
6
+
7
+ from .data_row import DataRow
8
+ from .expr import Expr
9
+ from .globals import ComparisonOperator
10
+ from .predicate import Predicate
11
+ from .row_builder import RowBuilder
12
+
13
+
14
class Comparison(Predicate):
    """Predicate that applies a ComparisonOperator to two operand expressions.

    The operands are kept, in order, in self.components (left operand first).
    """

    # (operator, implementation) pairs consulted by sql_expr() and eval(); the lambdas use the plain
    # Python comparison operators, so they apply to any pair of operands that supports them
    _COMPARATORS = (
        (ComparisonOperator.LT, lambda lhs, rhs: lhs < rhs),
        (ComparisonOperator.LE, lambda lhs, rhs: lhs <= rhs),
        (ComparisonOperator.EQ, lambda lhs, rhs: lhs == rhs),
        (ComparisonOperator.NE, lambda lhs, rhs: lhs != rhs),
        (ComparisonOperator.GT, lambda lhs, rhs: lhs > rhs),
        (ComparisonOperator.GE, lambda lhs, rhs: lhs >= rhs),
    )

    def __init__(self, operator: ComparisonOperator, op1: Expr, op2: Expr):
        super().__init__()
        self.operator = operator
        self.components = [op1, op2]
        # assigned last, once operator and components are in place
        # (presumably _create_id() depends on them -- see _id_attrs())
        self.id = self._create_id()

    def __str__(self) -> str:
        return f'{self._op1} {self.operator} {self._op2}'

    def _equals(self, other: Comparison) -> bool:
        """Return True if other uses the same operator (operands are not compared here)."""
        same_operator = self.operator == other.operator
        return same_operator

    def _id_attrs(self) -> List[Tuple[str, Any]]:
        """Return the base class' id attributes extended by this predicate's operator value."""
        inherited = super()._id_attrs()
        return inherited + [('operator', self.operator.value)]

    @property
    def _op1(self) -> Expr:
        """Left operand."""
        return self.components[0]

    @property
    def _op2(self) -> Expr:
        """Right operand."""
        return self.components[1]

    def _comparator(self) -> Optional[Any]:
        """Return the two-argument callable implementing self.operator, or None if there is none."""
        for candidate, implementation in self._COMPARATORS:
            if self.operator == candidate:
                return implementation
        return None

    def sql_expr(self) -> Optional[sql.ClauseElement]:
        """Return the comparison of both operands' SQL expressions.

        Returns None if either operand has no SQL expression or the operator is not one of the
        six known comparison operators.
        """
        lhs_sql = self._op1.sql_expr()
        rhs_sql = self._op2.sql_expr()
        if lhs_sql is None or rhs_sql is None:
            return None
        compare = self._comparator()
        if compare is None:
            return None
        return compare(lhs_sql, rhs_sql)

    def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
        """Compare the operands' values in data_row and store the result in this expr's slot.

        Nothing is read or written if the operator is not one of the six known comparison operators.
        """
        compare = self._comparator()
        if compare is None:
            return
        lhs_val = data_row[self._op1.slot_idx]
        rhs_val = data_row[self._op2.slot_idx]
        data_row[self.slot_idx] = compare(lhs_val, rhs_val)

    def _as_dict(self) -> Dict:
        """Return the base class' dict representation, preceded by an 'operator' entry."""
        result = {'operator': self.operator.value}
        # entries of the base class take precedence, as with {**...} unpacking
        result.update(super()._as_dict())
        return result

    @classmethod
    def _from_dict(cls, d: Dict, components: List[Expr]) -> Expr:
        """Rebuild a Comparison from d['operator'] and the first two entries of components."""
        assert 'operator' in d
        comparison_op = ComparisonOperator(d['operator'])
        return cls(comparison_op, components[0], components[1])
77
+
@@ -0,0 +1,98 @@
1
+ from __future__ import annotations
2
+ from typing import Optional, List, Any, Dict, Tuple, Callable
3
+ import operator
4
+
5
+ import sqlalchemy as sql
6
+
7
+ from .expr import Expr
8
+ from .globals import LogicalOperator
9
+ from .predicate import Predicate
10
+ from .data_row import DataRow
11
+ from .row_builder import RowBuilder
12
+ import pixeltable.catalog as catalog
13
+
14
+
15
class CompoundPredicate(Predicate):
    """Logical combination (AND, OR or NOT) of predicates.

    The operands are stored in self.components. For operators other than NOT, operands that are
    themselves CompoundPredicates with the same operator are flattened into this predicate's operand
    list (see _merge_operand()).
    """

    def __init__(self, operator: LogicalOperator, operands: List[Predicate]):
        """
        Args:
            operator: the logical operator to apply
            operands: exactly one predicate for NOT, at least two otherwise
        """
        super().__init__()
        self.operator = operator
        # operands are stored in self.components
        if self.operator == LogicalOperator.NOT:
            assert len(operands) == 1
            # copy, so that our components and the caller's list can change independently
            self.components = list(operands)
        else:
            assert len(operands) > 1
            # NOTE: this attribute is never populated in this class (operands are appended to
            # self.components, presumably initialized by the base class); it is only kept so that
            # any existing reader of it keeps working
            self.operands: List[Predicate] = []
            for operand in operands:
                self._merge_operand(operand)

        self.id = self._create_id()

    def __str__(self) -> str:
        if self.operator == LogicalOperator.NOT:
            return f'~({self.components[0]})'
        return f' {self.operator} '.join([f'({e})' for e in self.components])

    @classmethod
    def make_conjunction(cls, operands: List[Predicate]) -> Optional[Predicate]:
        """Return the AND of operands: None if there are none, the operand itself if there is one."""
        if len(operands) == 0:
            return None
        if len(operands) == 1:
            return operands[0]
        return CompoundPredicate(LogicalOperator.AND, operands)

    def _merge_operand(self, operand: Predicate) -> None:
        """
        Merge this operand, if possible, otherwise simply record it.
        """
        if isinstance(operand, CompoundPredicate) and operand.operator == self.operator:
            # this can be merged
            for child_op in operand.components:
                self._merge_operand(child_op)
        else:
            self.components.append(operand)

    def _equals(self, other: CompoundPredicate) -> bool:
        """Return True if other uses the same operator (operands are not compared here)."""
        return self.operator == other.operator

    def _id_attrs(self) -> List[Tuple[str, Any]]:
        """Return the base class' id attributes extended by this predicate's operator value."""
        return super()._id_attrs() + [('operator', self.operator.value)]

    def split_conjuncts(
            self, condition: Callable[[Predicate], bool]) -> Tuple[List[Predicate], Optional[Predicate]]:
        """Partition the operands of a conjunction by condition.

        For OR and NOT this defers to the base class. Otherwise returns the operands for which
        condition holds, plus the conjunction of the remaining operands (None if there are none).
        """
        if self.operator == LogicalOperator.OR or self.operator == LogicalOperator.NOT:
            return super().split_conjuncts(condition)
        matches: List[Predicate] = []
        non_matches: List[Predicate] = []
        for op in self.components:
            # evaluate the condition exactly once per operand, so that every operand ends up in
            # exactly one of the two lists
            if condition(op):
                matches.append(op)
            else:
                non_matches.append(op)
        return (matches, self.make_conjunction(non_matches))

    def sql_expr(self) -> Optional[sql.ClauseElement]:
        """Return the SQL form of this predicate, or None if any operand has no SQL expression."""
        sql_exprs = [op.sql_expr() for op in self.components]
        if any(e is None for e in sql_exprs):
            return None
        if self.operator == LogicalOperator.NOT:
            assert len(sql_exprs) == 1
            return sql.not_(sql_exprs[0])
        assert len(sql_exprs) > 1
        # deliberately not named 'operator': that would shadow the stdlib module imported by this file
        combine = sql.and_ if self.operator == LogicalOperator.AND else sql.or_
        return combine(*sql_exprs)

    def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
        """Combine the operands' values in data_row and store the result in this expr's slot."""
        if self.operator == LogicalOperator.NOT:
            data_row[self.slot_idx] = not data_row[self.components[0].slot_idx]
            return
        is_conjunction = self.operator == LogicalOperator.AND
        # start from the neutral element of the operator: True for AND, False for OR
        val = True if is_conjunction else False
        op_function = operator.and_ if is_conjunction else operator.or_
        for op in self.components:
            val = op_function(val, data_row[op.slot_idx])
        data_row[self.slot_idx] = val

    def _as_dict(self) -> Dict:
        """Return the base class' dict representation, preceded by an 'operator' entry."""
        return {'operator': self.operator.value, **super()._as_dict()}

    @classmethod
    def _from_dict(cls, d: Dict, components: List[Expr]) -> Expr:
        """Rebuild a CompoundPredicate from d['operator'] and the given operands."""
        assert 'operator' in d
        return cls(LogicalOperator(d['operator']), components)
98
+
@@ -0,0 +1,199 @@
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ import urllib.parse
5
+ import urllib.request
6
+ from typing import Optional, List, Any, Tuple
7
+
8
+ import sqlalchemy as sql
9
+ import pgvector.sqlalchemy
10
+ import PIL
11
+ import numpy as np
12
+
13
+
14
class DataRow:
    """
    Encapsulates all data and execution state needed by RowBuilder and DataRowBatch:
    - state for in-memory computation
    - state for storing the data
    This is not meant to be a black-box abstraction.

    In-memory representations by column type:
    - StringType: str
    - IntType: int
    - FloatType: float
    - BoolType: bool
    - TimestampType: datetime.datetime
    - JsonType: json-serializable object
    - ArrayType: numpy.ndarray
    - ImageType: PIL.Image.Image
    - VideoType: local path if available, otherwise url
    """

    def __init__(self, size: int, img_slot_idxs: List[int], media_slot_idxs: List[int], array_slot_idxs: List[int]):
        """
        Args:
            size: number of slots (cells) in this row
            img_slot_idxs: indices of the slots that hold images
            media_slot_idxs: indices of the slots that hold media other than images
            array_slot_idxs: indices of the slots that hold arrays
        """
        self.vals: List[Any] = [None] * size  # either cell values or exceptions
        self.has_val = [False] * size
        self.excs: List[Optional[Exception]] = [None] * size

        # control structures that are shared across all DataRows in a batch
        self.img_slot_idxs = img_slot_idxs
        self.media_slot_idxs = media_slot_idxs  # all media types aside from image
        self.array_slot_idxs = array_slot_idxs

        # the primary key of a store row is a sequence of ints (the number is different for table vs view)
        self.pk: Optional[Tuple[int, ...]] = None

        # file_urls:
        # - stored url of file for media in vals[i]
        # - None if vals[i] is not media type
        # - not None if file_paths[i] is not None
        self.file_urls: List[Optional[str]] = [None] * size

        # file_paths:
        # - local path of media file in vals[i]; points to the file cache if file_urls[i] is remote
        # - None if vals[i] is not a media type or if there is no local file yet for file_urls[i]
        self.file_paths: List[Optional[str]] = [None] * size

    def clear(self) -> None:
        """Reset all per-row state (values, exceptions, pk, urls, paths); the slot index lists are kept."""
        size = len(self.vals)
        self.vals = [None] * size
        self.has_val = [False] * size
        self.excs = [None] * size
        self.pk = None
        self.file_urls = [None] * size
        self.file_paths = [None] * size

    def copy(self, target: 'DataRow') -> None:
        """Create a copy of the contents of this DataRow in target
        The copy shares the cell values, but not the control structures (eg, self.has_val), because these
        need to be independently updateable.
        """
        target.vals = self.vals.copy()
        target.has_val = self.has_val.copy()
        target.excs = self.excs.copy()
        target.pk = self.pk
        target.file_urls = self.file_urls.copy()
        target.file_paths = self.file_paths.copy()

    def set_pk(self, pk: Tuple[int, ...]) -> None:
        self.pk = pk

    def has_exc(self, slot_idx: int) -> bool:
        """Return True if an exception has been recorded for this slot."""
        return self.excs[slot_idx] is not None

    def get_exc(self, slot_idx: int) -> Exception:
        """Return the exception recorded for this slot; the slot must have an exception and no value."""
        assert self.has_val[slot_idx] is False
        assert self.excs[slot_idx] is not None
        return self.excs[slot_idx]

    def set_exc(self, slot_idx: int, exc: Exception) -> None:
        """Record exc for this slot, discarding any value/url/path the slot already holds."""
        assert self.excs[slot_idx] is None
        self.excs[slot_idx] = exc

        if self.has_val[slot_idx]:
            # eg. during validation, where contents of file is found invalid
            self.has_val[slot_idx] = False
            self.vals[slot_idx] = None
            self.file_paths[slot_idx] = None
            self.file_urls[slot_idx] = None

    def __getitem__(self, index: int) -> Any:
        """Returns in-memory value, ie, what is needed for expr evaluation"""
        assert self.has_val[index], index

        if self.file_urls[index] is not None and index in self.img_slot_idxs:
            # if we need to load this from a file, it should have been materialized locally
            assert self.file_paths[index] is not None
            if self.vals[index] is None:
                # a bare 'import PIL' does not load the Image submodule; import it explicitly so that
                # this doesn't depend on some other module having imported it first
                import PIL.Image
                self.vals[index] = PIL.Image.open(self.file_paths[index])
                self.vals[index].load()

        return self.vals[index]

    def get_stored_val(self, index: int, sa_col_type: 'Optional[sql.types.TypeEngine]' = None) -> Any:
        """Return the value that gets stored in the db

        - image/media slots with a url: the url
        - array slots: the ndarray itself if sa_col_type is a pgvector Vector, otherwise the array
          serialized with np.save()
        - everything else: the in-memory value
        """
        assert self.excs[index] is None
        assert self.has_val[index]

        if self.file_urls[index] is not None and (index in self.img_slot_idxs or index in self.media_slot_idxs):
            # if this is an image or other media type we want to store, we should have a url
            return self.file_urls[index]

        if self.vals[index] is not None and index in self.array_slot_idxs:
            assert isinstance(self.vals[index], np.ndarray)
            np_array = self.vals[index]
            if sa_col_type is not None and isinstance(sa_col_type, pgvector.sqlalchemy.Vector):
                return np_array
            buffer = io.BytesIO()
            np.save(buffer, np_array)
            return buffer.getvalue()

        return self.vals[index]

    def __setitem__(self, idx: int, val: Any) -> None:
        """Assign in-memory cell value
        This allows overwriting

        A str assigned to an image/media slot is interpreted as a local file path or a url and is
        recorded in file_paths/file_urls; bytes assigned to an array slot are loaded with np.load().
        """
        assert self.excs[idx] is None

        if (idx in self.img_slot_idxs or idx in self.media_slot_idxs) and isinstance(val, str):
            # this is either a local file path or a URL
            parsed = urllib.parse.urlparse(val)
            # Determine if this is a local file or a remote URL. If the scheme length is <= 1,
            # we assume it's a local file. (This is because a Windows path will be interpreted
            # by urllib as a URL with scheme equal to the drive letter.)
            if len(parsed.scheme) <= 1 or parsed.scheme == 'file':
                # local file path
                assert self.file_urls[idx] is None and self.file_paths[idx] is None
                if len(parsed.scheme) <= 1:
                    self.file_urls[idx] = urllib.parse.urljoin('file:', urllib.request.pathname2url(val))
                    self.file_paths[idx] = val
                else:
                    self.file_urls[idx] = val
                    # url2pathname() ensures proper handling on Windows. It already percent-decodes the
                    # path, so we must not unquote() a second time: that would corrupt file names that
                    # contain a literal escape sequence (eg, 'a%20b').
                    self.file_paths[idx] = urllib.request.url2pathname(parsed.path)
            else:
                # URL
                assert self.file_urls[idx] is None
                self.file_urls[idx] = val

            if idx in self.media_slot_idxs:
                self.vals[idx] = self.file_paths[idx] if self.file_paths[idx] is not None else self.file_urls[idx]
        elif idx in self.array_slot_idxs and isinstance(val, bytes):
            self.vals[idx] = np.load(io.BytesIO(val))
        else:
            self.vals[idx] = val
        self.has_val[idx] = True

    def set_file_path(self, idx: int, path: str) -> None:
        """Augment an existing url with a local file path"""
        assert self.has_val[idx]
        assert idx in self.img_slot_idxs or idx in self.media_slot_idxs
        self.file_paths[idx] = path
        if idx in self.media_slot_idxs:
            # for non-image media, the in-memory value is the local path
            self.vals[idx] = path

    def flush_img(self, index: int, filepath: Optional[str] = None) -> None:
        """Discard the in-memory value and save it to a local file, if filepath is not None"""
        if self.vals[index] is None:
            return
        assert self.excs[index] is None
        if self.file_paths[index] is None:
            if filepath is not None:
                # we want to save this to a file
                self.file_paths[index] = filepath
                self.file_urls[index] = urllib.parse.urljoin('file:', urllib.request.pathname2url(filepath))
                # NOTE(review): always written as JPEG; Pillow cannot write every image mode as JPEG
                # (eg, images with an alpha channel) -- confirm callers only flush compatible images
                self.vals[index].save(filepath, format='JPEG')
            else:
                # we discard the content of this cell
                self.has_val[index] = False
        else:
            # we already have a file for this image, nothing left to do
            pass
        self.vals[index] = None
199
+