pixeltable 0.1.0__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (147) hide show
  1. pixeltable/__init__.py +34 -6
  2. pixeltable/catalog/__init__.py +13 -0
  3. pixeltable/catalog/catalog.py +159 -0
  4. pixeltable/catalog/column.py +200 -0
  5. pixeltable/catalog/dir.py +32 -0
  6. pixeltable/catalog/globals.py +33 -0
  7. pixeltable/catalog/insertable_table.py +191 -0
  8. pixeltable/catalog/named_function.py +36 -0
  9. pixeltable/catalog/path.py +58 -0
  10. pixeltable/catalog/path_dict.py +139 -0
  11. pixeltable/catalog/schema_object.py +39 -0
  12. pixeltable/catalog/table.py +581 -0
  13. pixeltable/catalog/table_version.py +749 -0
  14. pixeltable/catalog/table_version_path.py +133 -0
  15. pixeltable/catalog/view.py +203 -0
  16. pixeltable/client.py +590 -30
  17. pixeltable/dataframe.py +540 -349
  18. pixeltable/env.py +359 -45
  19. pixeltable/exceptions.py +12 -21
  20. pixeltable/exec/__init__.py +9 -0
  21. pixeltable/exec/aggregation_node.py +78 -0
  22. pixeltable/exec/cache_prefetch_node.py +116 -0
  23. pixeltable/exec/component_iteration_node.py +79 -0
  24. pixeltable/exec/data_row_batch.py +95 -0
  25. pixeltable/exec/exec_context.py +22 -0
  26. pixeltable/exec/exec_node.py +61 -0
  27. pixeltable/exec/expr_eval_node.py +217 -0
  28. pixeltable/exec/in_memory_data_node.py +69 -0
  29. pixeltable/exec/media_validation_node.py +43 -0
  30. pixeltable/exec/sql_scan_node.py +225 -0
  31. pixeltable/exprs/__init__.py +24 -0
  32. pixeltable/exprs/arithmetic_expr.py +102 -0
  33. pixeltable/exprs/array_slice.py +71 -0
  34. pixeltable/exprs/column_property_ref.py +77 -0
  35. pixeltable/exprs/column_ref.py +105 -0
  36. pixeltable/exprs/comparison.py +77 -0
  37. pixeltable/exprs/compound_predicate.py +98 -0
  38. pixeltable/exprs/data_row.py +195 -0
  39. pixeltable/exprs/expr.py +586 -0
  40. pixeltable/exprs/expr_set.py +39 -0
  41. pixeltable/exprs/function_call.py +380 -0
  42. pixeltable/exprs/globals.py +69 -0
  43. pixeltable/exprs/image_member_access.py +115 -0
  44. pixeltable/exprs/image_similarity_predicate.py +58 -0
  45. pixeltable/exprs/inline_array.py +107 -0
  46. pixeltable/exprs/inline_dict.py +101 -0
  47. pixeltable/exprs/is_null.py +38 -0
  48. pixeltable/exprs/json_mapper.py +121 -0
  49. pixeltable/exprs/json_path.py +159 -0
  50. pixeltable/exprs/literal.py +54 -0
  51. pixeltable/exprs/object_ref.py +41 -0
  52. pixeltable/exprs/predicate.py +44 -0
  53. pixeltable/exprs/row_builder.py +355 -0
  54. pixeltable/exprs/rowid_ref.py +94 -0
  55. pixeltable/exprs/type_cast.py +53 -0
  56. pixeltable/exprs/variable.py +45 -0
  57. pixeltable/func/__init__.py +9 -0
  58. pixeltable/func/aggregate_function.py +194 -0
  59. pixeltable/func/batched_function.py +53 -0
  60. pixeltable/func/callable_function.py +69 -0
  61. pixeltable/func/expr_template_function.py +82 -0
  62. pixeltable/func/function.py +110 -0
  63. pixeltable/func/function_registry.py +227 -0
  64. pixeltable/func/globals.py +36 -0
  65. pixeltable/func/nos_function.py +202 -0
  66. pixeltable/func/signature.py +166 -0
  67. pixeltable/func/udf.py +163 -0
  68. pixeltable/functions/__init__.py +52 -103
  69. pixeltable/functions/eval.py +216 -0
  70. pixeltable/functions/fireworks.py +34 -0
  71. pixeltable/functions/huggingface.py +120 -0
  72. pixeltable/functions/image.py +16 -0
  73. pixeltable/functions/openai.py +256 -0
  74. pixeltable/functions/pil/image.py +148 -7
  75. pixeltable/functions/string.py +13 -0
  76. pixeltable/functions/together.py +122 -0
  77. pixeltable/functions/util.py +41 -0
  78. pixeltable/functions/video.py +62 -0
  79. pixeltable/iterators/__init__.py +3 -0
  80. pixeltable/iterators/base.py +48 -0
  81. pixeltable/iterators/document.py +311 -0
  82. pixeltable/iterators/video.py +89 -0
  83. pixeltable/metadata/__init__.py +54 -0
  84. pixeltable/metadata/converters/convert_10.py +18 -0
  85. pixeltable/metadata/schema.py +211 -0
  86. pixeltable/plan.py +656 -0
  87. pixeltable/store.py +418 -182
  88. pixeltable/tests/conftest.py +146 -88
  89. pixeltable/tests/functions/test_fireworks.py +42 -0
  90. pixeltable/tests/functions/test_functions.py +60 -0
  91. pixeltable/tests/functions/test_huggingface.py +158 -0
  92. pixeltable/tests/functions/test_openai.py +152 -0
  93. pixeltable/tests/functions/test_together.py +111 -0
  94. pixeltable/tests/test_audio.py +65 -0
  95. pixeltable/tests/test_catalog.py +27 -0
  96. pixeltable/tests/test_client.py +14 -14
  97. pixeltable/tests/test_component_view.py +370 -0
  98. pixeltable/tests/test_dataframe.py +439 -0
  99. pixeltable/tests/test_dirs.py +78 -62
  100. pixeltable/tests/test_document.py +120 -0
  101. pixeltable/tests/test_exprs.py +592 -135
  102. pixeltable/tests/test_function.py +297 -67
  103. pixeltable/tests/test_migration.py +43 -0
  104. pixeltable/tests/test_nos.py +54 -0
  105. pixeltable/tests/test_snapshot.py +208 -0
  106. pixeltable/tests/test_table.py +1195 -263
  107. pixeltable/tests/test_transactional_directory.py +42 -0
  108. pixeltable/tests/test_types.py +5 -11
  109. pixeltable/tests/test_video.py +151 -34
  110. pixeltable/tests/test_view.py +530 -0
  111. pixeltable/tests/utils.py +320 -45
  112. pixeltable/tool/create_test_db_dump.py +149 -0
  113. pixeltable/tool/create_test_video.py +81 -0
  114. pixeltable/type_system.py +445 -124
  115. pixeltable/utils/__init__.py +17 -46
  116. pixeltable/utils/arrow.py +98 -0
  117. pixeltable/utils/clip.py +12 -15
  118. pixeltable/utils/coco.py +136 -0
  119. pixeltable/utils/documents.py +39 -0
  120. pixeltable/utils/filecache.py +195 -0
  121. pixeltable/utils/help.py +11 -0
  122. pixeltable/utils/hf_datasets.py +157 -0
  123. pixeltable/utils/media_store.py +76 -0
  124. pixeltable/utils/parquet.py +167 -0
  125. pixeltable/utils/pytorch.py +91 -0
  126. pixeltable/utils/s3.py +13 -0
  127. pixeltable/utils/sql.py +17 -0
  128. pixeltable/utils/transactional_directory.py +35 -0
  129. pixeltable-0.2.4.dist-info/LICENSE +18 -0
  130. pixeltable-0.2.4.dist-info/METADATA +127 -0
  131. pixeltable-0.2.4.dist-info/RECORD +132 -0
  132. {pixeltable-0.1.0.dist-info → pixeltable-0.2.4.dist-info}/WHEEL +1 -1
  133. pixeltable/catalog.py +0 -1421
  134. pixeltable/exprs.py +0 -1745
  135. pixeltable/function.py +0 -269
  136. pixeltable/functions/clip.py +0 -10
  137. pixeltable/functions/pil/__init__.py +0 -23
  138. pixeltable/functions/tf.py +0 -21
  139. pixeltable/index.py +0 -57
  140. pixeltable/tests/test_dict.py +0 -24
  141. pixeltable/tests/test_functions.py +0 -11
  142. pixeltable/tests/test_tf.py +0 -69
  143. pixeltable/tf.py +0 -33
  144. pixeltable/utils/tf.py +0 -33
  145. pixeltable/utils/video.py +0 -32
  146. pixeltable-0.1.0.dist-info/METADATA +0 -34
  147. pixeltable-0.1.0.dist-info/RECORD +0 -36
@@ -0,0 +1,211 @@
1
+ from typing import Optional, List, Dict, get_type_hints, Type, Any, TypeVar, Tuple, Union
2
+ import platform
3
+ import uuid
4
+ import dataclasses
5
+
6
+ import sqlalchemy as sql
7
+ from sqlalchemy import Integer, String, Boolean, BigInteger, LargeBinary
8
+ from sqlalchemy.dialects.postgresql import UUID, JSONB
9
+ from sqlalchemy import ForeignKey, UniqueConstraint, ForeignKeyConstraint
10
+ from sqlalchemy.orm import declarative_base
11
+
12
+ Base = declarative_base()
13
+
14
T = TypeVar('T')


def md_from_dict(data_class_type: Type[T], data: Any) -> T:
    """Re-instantiate a dataclass instance that contains nested dataclasses from a dict.

    The target type is walked recursively alongside ``data``:

    - dataclass: every key of ``data`` is converted according to the type hint of the field of the same name
      and passed to the dataclass constructor
    - ``Optional[X]``: ``None`` stays ``None``, anything else is converted as ``X``
    - ``List[X]``: every element is converted as ``X``
    - ``Dict[K, V]``: keys are coerced with ``K(key)`` when ``K`` is a concrete class, values are converted as ``V``
    - ``Tuple[X, Y, ...]``: elements are converted position by position; the homogeneous form
      ``Tuple[X, ...]`` converts every element as ``X``
    - anything else (plain types, ``Any``, unions with several alternatives, unsupported generics):
      ``data`` is returned unchanged

    Args:
        data_class_type: the type to reconstruct (a dataclass, a typing generic, or a plain type)
        data: the deserialized value (dicts, lists and scalars)

    Returns:
        ``data`` converted to ``data_class_type``.

    Raises:
        KeyError: if ``data`` contains a key that is not an annotated field of the target dataclass.
    """
    if dataclasses.is_dataclass(data_class_type):
        field_types = get_type_hints(data_class_type)
        converted = {name: md_from_dict(field_types[name], val) for name, val in data.items()}
        return data_class_type(**converted)

    origin = getattr(data_class_type, '__origin__', None)
    if origin is None:
        # plain (non-generic) type: nothing to reconstruct
        return data
    # unparameterized generics (e.g. bare typing.List) may not expose __args__ at all
    type_args = getattr(data_class_type, '__args__', ())

    if origin is Union:
        if data is None:
            return None
        candidates = [arg for arg in type_args if arg is not type(None)]
        if len(candidates) == 1:
            # Optional[X]
            return md_from_dict(candidates[0], data)
        # several alternatives: we cannot tell which one was serialized, so pass the value through
        # rather than dropping it
        return data

    if origin is list:
        elem_type = type_args[0] if type_args else Any
        return [md_from_dict(elem_type, elem) for elem in data]

    if origin is dict:
        key_type = type_args[0] if type_args else Any
        val_type = type_args[1] if len(type_args) > 1 else Any
        # keys are only coerced when the key type is an instantiable class (e.g. int, so that keys which
        # arrive as strings are turned back into ints); Any/TypeVar keys are left untouched
        coerce_key = isinstance(key_type, type) and key_type is not Any
        return {
            (key_type(key) if coerce_key else key): md_from_dict(val_type, val)
            for key, val in data.items()
        }

    if origin is tuple:
        if len(type_args) == 2 and type_args[1] is Ellipsis:
            # Tuple[X, ...]: homogeneous and of arbitrary length
            return tuple(md_from_dict(type_args[0], elem) for elem in data)
        return tuple(md_from_dict(arg_type, elem) for arg_type, elem in zip(type_args, data))

    # unsupported generic: return the value as-is instead of silently replacing it with None
    return data
37
+
38
+
39
+ # structure of the stored metadata:
40
+ # - each schema entity that grows somehow proportionally to the data (# of output_rows, total insert operations,
41
+ # number of schema changes) gets its own table
42
+ # - each table has an 'md' column that basically contains the payload
43
+ # - exceptions to that are foreign keys without which lookups would be too slow (ex.: TableSchemaVersions.tbl_id)
44
+ # - the md column contains a dataclass serialized to json; this has the advantage of making changes to the metadata
45
+ # schema easier (the goal is not to have to rely on some schema migration framework; if that breaks for some user,
46
+ # it would be very difficult to patch up)
47
+
48
@dataclasses.dataclass
class SystemInfoMd:
    """Payload of the 'md' column of SystemInfo."""

    # version number of the metadata schema
    schema_version: int
51
+
52
+
53
class SystemInfo(Base):
    """A single-row table that contains system-wide metadata."""

    __tablename__ = 'systeminfo'

    # placeholder primary key; defaults to 0
    dummy = sql.Column(Integer, nullable=False, default=0, primary_key=True)
    # SystemInfoMd
    md = sql.Column(JSONB, nullable=False)
58
+
59
+
60
@dataclasses.dataclass
class DirMd:
    """Metadata record for a directory; it holds only the directory's name."""

    name: str
63
+
64
+
65
class Dir(Base):
    """ORM mapping for the 'dirs' table."""

    __tablename__ = 'dirs'

    # primary key; a fresh uuid4 is generated by default
    id = sql.Column(UUID(as_uuid=True), nullable=False, default=uuid.uuid4, primary_key=True)
    # self-referencing foreign key to dirs.id; nullable
    parent_id = sql.Column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
    # presumably a serialized DirMd -- confirm against the code that writes this column
    md = sql.Column(JSONB, nullable=False)
71
+
72
+
73
@dataclasses.dataclass
class ColumnHistory:
    """
    Add/drop record of a single column; there is one record per column (across all schema versions).

    This is what is needed to GC unreachable storage columns: a column that was added after table snapshot n
    and dropped before table snapshot n+1 can be removed from the stored table.
    """

    col_id: int
    # schema version in which the column was added
    schema_version_add: int
    # schema version in which the column was dropped; presumably None while the column still exists (confirm)
    schema_version_drop: Optional[int]
84
+
85
+
86
@dataclasses.dataclass
class ViewMd:
    """View-specific metadata, stored as part of TableMd (TableMd.view_md)."""

    is_snapshot: bool

    # list of (table id, version) pairs; for mutable views, all versions are None
    base_versions: List[Tuple[str, Optional[int]]]

    # view-only: the filter predicate applied to the base table
    predicate: Optional[Dict[str, Any]]

    # component views only: the ComponentIterator subclass
    iterator_class_fqn: Optional[str]

    # component views only: the args to pass to the iterator class constructor
    iterator_args: Optional[Dict[str, Any]]
101
+
102
+
103
@dataclasses.dataclass
class TableMd:
    """Payload of the 'md' column of Table."""

    name: str

    # starts at 0 and increases monotonically w/in a Table, for both data and schema changes
    current_version: int
    # every version has a corresponding schema version (current_version >= current_schema_version)
    current_schema_version: int

    # source of Column.id values
    next_col_id: int

    # source of values for the rowid column in the storage table; every row is assigned a unique and
    # immutable rowid on insertion
    next_row_id: int

    # col_id -> ColumnHistory
    column_history: Dict[int, ColumnHistory]

    view_md: Optional[ViewMd]
122
+
123
+
124
class Table(Base):
    """
    Table represents both tables and views.

    In essence, views are a subclass of tables, because they also store materialized columns. What sets them
    apart:
    - a view has a base, which is either a (live) table or a snapshot
    - a view can have a filter predicate
    """

    __tablename__ = 'tables'

    MAX_VERSION = 2**63 - 1  # 9223372036854775807

    # primary key; a fresh uuid4 is generated by default
    id = sql.Column(UUID(as_uuid=True), nullable=False, default=uuid.uuid4, primary_key=True)
    # foreign key to dirs.id; required
    dir_id = sql.Column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=False)
    # TableMd
    md = sql.Column(JSONB, nullable=False)
139
+
140
+
141
@dataclasses.dataclass
class TableVersionMd:
    """Payload of the 'md' column of TableVersion."""

    # time.time()
    created_at: float
    version: int
    schema_version: int
146
+
147
+
148
class TableVersion(Base):
    """ORM mapping for the 'tableversions' table; the primary key is (tbl_id, version)."""

    __tablename__ = 'tableversions'

    # foreign key to tables.id; first part of the primary key
    tbl_id = sql.Column(UUID(as_uuid=True), ForeignKey('tables.id'), nullable=False, primary_key=True)
    # second part of the primary key
    version = sql.Column(BigInteger, nullable=False, primary_key=True)
    # TableVersionMd
    md = sql.Column(JSONB, nullable=False)
153
+
154
+
155
@dataclasses.dataclass
class SchemaColumn:
    """
    One column of the logical (user-visible) schema of a table.

    Each new schema version contains the full set of columns, ie, there is one record per
    (column x schema version).
    """

    pos: int
    name: str
    col_type: dict
    is_pk: bool
    value_expr: Optional[dict]
    stored: Optional[bool]
    # True: a vector index is created for this column
    is_indexed: bool
169
+
170
+
171
@dataclasses.dataclass
class TableSchemaVersionMd:
    """Payload of the 'md' column of TableSchemaVersion."""

    schema_version: int
    preceding_schema_version: Optional[int]
    # col_id -> SchemaColumn
    columns: Dict[int, SchemaColumn]
    num_retained_versions: int
    comment: str
178
+
179
+
180
+ # versioning: each table schema change results in a new record
181
class TableSchemaVersion(Base):
    """ORM mapping for the 'tableschemaversions' table; the primary key is (tbl_id, schema_version)."""

    __tablename__ = 'tableschemaversions'

    # foreign key to tables.id; first part of the primary key
    tbl_id = sql.Column(UUID(as_uuid=True), ForeignKey('tables.id'), nullable=False, primary_key=True)
    # second part of the primary key
    schema_version = sql.Column(BigInteger, nullable=False, primary_key=True)
    # TableSchemaVersionMd
    md = sql.Column(JSONB, nullable=False)
187
+
188
+
189
@dataclasses.dataclass
class FunctionMd:
    """Payload of the 'md' column of Function."""

    name: str
    # platform.python_version
    py_version: str
    # name of the Function subclass
    class_name: str
    # part of the output of Function.to_store()
    md: dict
195
+
196
+
197
class Function(Base):
    """
    User-defined functions that are not module functions (ie, aren't available at runtime as a symbol in a
    known module).

    - a Function without a name is an anonymous function used in the definition of a computed column
    - a Function with a name is also assigned to a database and directory

    The Python version under which a Function was created (and the callable pickled) is stored as well, in
    order to warn against version mismatches.
    """

    __tablename__ = 'functions'

    # primary key; a fresh uuid4 is generated by default
    id = sql.Column(UUID(as_uuid=True), nullable=False, default=uuid.uuid4, primary_key=True)
    # foreign key to dirs.id; nullable
    dir_id = sql.Column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
    # FunctionMd
    md = sql.Column(JSONB, nullable=False)
    # nullable binary payload; presumably the pickled callable -- confirm against the code that writes it
    binary_obj = sql.Column(LargeBinary, nullable=True)