pixeltable 0.1.2__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. pixeltable-0.2.1/LICENSE +18 -0
  2. pixeltable-0.2.1/PKG-INFO +119 -0
  3. pixeltable-0.2.1/README.md +87 -0
  4. pixeltable-0.2.1/pixeltable/__init__.py +44 -0
  5. pixeltable-0.2.1/pixeltable/catalog/__init__.py +13 -0
  6. pixeltable-0.2.1/pixeltable/catalog/catalog.py +159 -0
  7. pixeltable-0.2.1/pixeltable/catalog/column.py +200 -0
  8. pixeltable-0.2.1/pixeltable/catalog/dir.py +32 -0
  9. pixeltable-0.2.1/pixeltable/catalog/globals.py +33 -0
  10. pixeltable-0.2.1/pixeltable/catalog/insertable_table.py +191 -0
  11. pixeltable-0.2.1/pixeltable/catalog/named_function.py +36 -0
  12. pixeltable-0.2.1/pixeltable/catalog/path.py +58 -0
  13. pixeltable-0.2.1/pixeltable/catalog/path_dict.py +139 -0
  14. pixeltable-0.2.1/pixeltable/catalog/schema_object.py +39 -0
  15. pixeltable-0.2.1/pixeltable/catalog/table.py +581 -0
  16. pixeltable-0.2.1/pixeltable/catalog/table_version.py +749 -0
  17. pixeltable-0.2.1/pixeltable/catalog/table_version_path.py +133 -0
  18. pixeltable-0.2.1/pixeltable/catalog/view.py +203 -0
  19. pixeltable-0.2.1/pixeltable/client.py +534 -0
  20. pixeltable-0.2.1/pixeltable/dataframe.py +631 -0
  21. pixeltable-0.2.1/pixeltable/env.py +414 -0
  22. pixeltable-0.2.1/pixeltable/exceptions.py +17 -0
  23. pixeltable-0.2.1/pixeltable/exec/__init__.py +9 -0
  24. pixeltable-0.2.1/pixeltable/exec/aggregation_node.py +78 -0
  25. pixeltable-0.2.1/pixeltable/exec/cache_prefetch_node.py +113 -0
  26. pixeltable-0.2.1/pixeltable/exec/component_iteration_node.py +79 -0
  27. pixeltable-0.2.1/pixeltable/exec/data_row_batch.py +95 -0
  28. pixeltable-0.2.1/pixeltable/exec/exec_context.py +22 -0
  29. pixeltable-0.2.1/pixeltable/exec/exec_node.py +61 -0
  30. pixeltable-0.2.1/pixeltable/exec/expr_eval_node.py +217 -0
  31. pixeltable-0.2.1/pixeltable/exec/in_memory_data_node.py +69 -0
  32. pixeltable-0.2.1/pixeltable/exec/media_validation_node.py +43 -0
  33. pixeltable-0.2.1/pixeltable/exec/sql_scan_node.py +225 -0
  34. pixeltable-0.2.1/pixeltable/exprs/__init__.py +24 -0
  35. pixeltable-0.2.1/pixeltable/exprs/arithmetic_expr.py +102 -0
  36. pixeltable-0.2.1/pixeltable/exprs/array_slice.py +71 -0
  37. pixeltable-0.2.1/pixeltable/exprs/column_property_ref.py +77 -0
  38. pixeltable-0.2.1/pixeltable/exprs/column_ref.py +105 -0
  39. pixeltable-0.2.1/pixeltable/exprs/comparison.py +77 -0
  40. pixeltable-0.2.1/pixeltable/exprs/compound_predicate.py +98 -0
  41. pixeltable-0.2.1/pixeltable/exprs/data_row.py +187 -0
  42. pixeltable-0.2.1/pixeltable/exprs/expr.py +586 -0
  43. pixeltable-0.2.1/pixeltable/exprs/expr_set.py +39 -0
  44. pixeltable-0.2.1/pixeltable/exprs/function_call.py +380 -0
  45. pixeltable-0.2.1/pixeltable/exprs/globals.py +69 -0
  46. pixeltable-0.2.1/pixeltable/exprs/image_member_access.py +115 -0
  47. pixeltable-0.2.1/pixeltable/exprs/image_similarity_predicate.py +58 -0
  48. pixeltable-0.2.1/pixeltable/exprs/inline_array.py +107 -0
  49. pixeltable-0.2.1/pixeltable/exprs/inline_dict.py +101 -0
  50. pixeltable-0.2.1/pixeltable/exprs/is_null.py +38 -0
  51. pixeltable-0.2.1/pixeltable/exprs/json_mapper.py +121 -0
  52. pixeltable-0.2.1/pixeltable/exprs/json_path.py +159 -0
  53. pixeltable-0.2.1/pixeltable/exprs/literal.py +54 -0
  54. pixeltable-0.2.1/pixeltable/exprs/object_ref.py +41 -0
  55. pixeltable-0.2.1/pixeltable/exprs/predicate.py +44 -0
  56. pixeltable-0.2.1/pixeltable/exprs/row_builder.py +355 -0
  57. pixeltable-0.2.1/pixeltable/exprs/rowid_ref.py +94 -0
  58. pixeltable-0.2.1/pixeltable/exprs/type_cast.py +53 -0
  59. pixeltable-0.2.1/pixeltable/exprs/variable.py +45 -0
  60. pixeltable-0.2.1/pixeltable/func/__init__.py +9 -0
  61. pixeltable-0.2.1/pixeltable/func/aggregate_function.py +194 -0
  62. pixeltable-0.2.1/pixeltable/func/batched_function.py +53 -0
  63. pixeltable-0.2.1/pixeltable/func/callable_function.py +69 -0
  64. pixeltable-0.2.1/pixeltable/func/expr_template_function.py +82 -0
  65. pixeltable-0.2.1/pixeltable/func/function.py +110 -0
  66. pixeltable-0.2.1/pixeltable/func/function_registry.py +227 -0
  67. pixeltable-0.2.1/pixeltable/func/globals.py +36 -0
  68. pixeltable-0.2.1/pixeltable/func/nos_function.py +202 -0
  69. pixeltable-0.2.1/pixeltable/func/signature.py +166 -0
  70. pixeltable-0.2.1/pixeltable/func/udf.py +163 -0
  71. pixeltable-0.2.1/pixeltable/functions/__init__.py +95 -0
  72. pixeltable-0.2.1/pixeltable/functions/eval.py +216 -0
  73. pixeltable-0.2.1/pixeltable/functions/fireworks.py +61 -0
  74. pixeltable-0.2.1/pixeltable/functions/huggingface.py +120 -0
  75. pixeltable-0.2.1/pixeltable/functions/image.py +16 -0
  76. pixeltable-0.2.1/pixeltable/functions/openai.py +88 -0
  77. pixeltable-0.2.1/pixeltable/functions/pil/image.py +150 -0
  78. pixeltable-0.2.1/pixeltable/functions/string.py +13 -0
  79. pixeltable-0.2.1/pixeltable/functions/together.py +27 -0
  80. pixeltable-0.2.1/pixeltable/functions/util.py +41 -0
  81. pixeltable-0.2.1/pixeltable/functions/video.py +62 -0
  82. pixeltable-0.2.1/pixeltable/iterators/__init__.py +3 -0
  83. pixeltable-0.2.1/pixeltable/iterators/base.py +48 -0
  84. pixeltable-0.2.1/pixeltable/iterators/document.py +311 -0
  85. pixeltable-0.2.1/pixeltable/iterators/video.py +89 -0
  86. pixeltable-0.2.1/pixeltable/metadata/__init__.py +54 -0
  87. pixeltable-0.2.1/pixeltable/metadata/converters/convert_10.py +18 -0
  88. pixeltable-0.2.1/pixeltable/metadata/schema.py +211 -0
  89. pixeltable-0.2.1/pixeltable/plan.py +656 -0
  90. pixeltable-0.2.1/pixeltable/store.py +422 -0
  91. pixeltable-0.2.1/pixeltable/tests/conftest.py +175 -0
  92. pixeltable-0.2.1/pixeltable/tests/test_audio.py +65 -0
  93. pixeltable-0.2.1/pixeltable/tests/test_catalog.py +27 -0
  94. pixeltable-0.2.1/pixeltable/tests/test_client.py +21 -0
  95. pixeltable-0.2.1/pixeltable/tests/test_component_view.py +372 -0
  96. pixeltable-0.2.1/pixeltable/tests/test_dataframe.py +433 -0
  97. pixeltable-0.2.1/pixeltable/tests/test_dirs.py +107 -0
  98. pixeltable-0.2.1/pixeltable/tests/test_document.py +117 -0
  99. pixeltable-0.2.1/pixeltable/tests/test_exprs.py +804 -0
  100. pixeltable-0.2.1/pixeltable/tests/test_function.py +324 -0
  101. pixeltable-0.2.1/pixeltable/tests/test_functions.py +293 -0
  102. pixeltable-0.2.1/pixeltable/tests/test_migration.py +43 -0
  103. pixeltable-0.2.1/pixeltable/tests/test_nos.py +54 -0
  104. pixeltable-0.2.1/pixeltable/tests/test_snapshot.py +208 -0
  105. pixeltable-0.2.1/pixeltable/tests/test_table.py +1158 -0
  106. pixeltable-0.2.1/pixeltable/tests/test_transactional_directory.py +42 -0
  107. {pixeltable-0.1.2 → pixeltable-0.2.1}/pixeltable/tests/test_types.py +5 -11
  108. pixeltable-0.2.1/pixeltable/tests/test_video.py +157 -0
  109. pixeltable-0.2.1/pixeltable/tests/test_view.py +530 -0
  110. pixeltable-0.2.1/pixeltable/tests/utils.py +274 -0
  111. pixeltable-0.2.1/pixeltable/tool/create_test_db_dump.py +149 -0
  112. pixeltable-0.2.1/pixeltable/type_system.py +938 -0
  113. pixeltable-0.2.1/pixeltable/utils/__init__.py +17 -0
  114. pixeltable-0.2.1/pixeltable/utils/clip.py +18 -0
  115. pixeltable-0.2.1/pixeltable/utils/coco.py +136 -0
  116. pixeltable-0.2.1/pixeltable/utils/documents.py +39 -0
  117. pixeltable-0.2.1/pixeltable/utils/filecache.py +195 -0
  118. pixeltable-0.2.1/pixeltable/utils/help.py +11 -0
  119. pixeltable-0.2.1/pixeltable/utils/media_store.py +76 -0
  120. pixeltable-0.2.1/pixeltable/utils/parquet.py +126 -0
  121. pixeltable-0.2.1/pixeltable/utils/pytorch.py +172 -0
  122. pixeltable-0.2.1/pixeltable/utils/s3.py +13 -0
  123. pixeltable-0.2.1/pixeltable/utils/sql.py +17 -0
  124. pixeltable-0.2.1/pixeltable/utils/transactional_directory.py +35 -0
  125. pixeltable-0.2.1/pyproject.toml +122 -0
  126. pixeltable-0.1.2/LICENSE +0 -201
  127. pixeltable-0.1.2/PKG-INFO +0 -89
  128. pixeltable-0.1.2/README.md +0 -61
  129. pixeltable-0.1.2/pixeltable/__init__.py +0 -27
  130. pixeltable-0.1.2/pixeltable/catalog.py +0 -1421
  131. pixeltable-0.1.2/pixeltable/client.py +0 -45
  132. pixeltable-0.1.2/pixeltable/dataframe.py +0 -440
  133. pixeltable-0.1.2/pixeltable/env.py +0 -89
  134. pixeltable-0.1.2/pixeltable/exceptions.py +0 -26
  135. pixeltable-0.1.2/pixeltable/exprs.py +0 -1745
  136. pixeltable-0.1.2/pixeltable/function.py +0 -269
  137. pixeltable-0.1.2/pixeltable/functions/__init__.py +0 -146
  138. pixeltable-0.1.2/pixeltable/functions/clip.py +0 -10
  139. pixeltable-0.1.2/pixeltable/functions/pil/__init__.py +0 -23
  140. pixeltable-0.1.2/pixeltable/functions/pil/image.py +0 -9
  141. pixeltable-0.1.2/pixeltable/functions/tf.py +0 -21
  142. pixeltable-0.1.2/pixeltable/index.py +0 -57
  143. pixeltable-0.1.2/pixeltable/store.py +0 -191
  144. pixeltable-0.1.2/pixeltable/tests/conftest.py +0 -118
  145. pixeltable-0.1.2/pixeltable/tests/test_client.py +0 -21
  146. pixeltable-0.1.2/pixeltable/tests/test_dict.py +0 -24
  147. pixeltable-0.1.2/pixeltable/tests/test_dirs.py +0 -91
  148. pixeltable-0.1.2/pixeltable/tests/test_exprs.py +0 -348
  149. pixeltable-0.1.2/pixeltable/tests/test_function.py +0 -94
  150. pixeltable-0.1.2/pixeltable/tests/test_functions.py +0 -11
  151. pixeltable-0.1.2/pixeltable/tests/test_table.py +0 -330
  152. pixeltable-0.1.2/pixeltable/tests/test_tf.py +0 -69
  153. pixeltable-0.1.2/pixeltable/tests/test_video.py +0 -42
  154. pixeltable-0.1.2/pixeltable/tests/utils.py +0 -133
  155. pixeltable-0.1.2/pixeltable/tf.py +0 -33
  156. pixeltable-0.1.2/pixeltable/type_system.py +0 -581
  157. pixeltable-0.1.2/pixeltable/utils/__init__.py +0 -46
  158. pixeltable-0.1.2/pixeltable/utils/clip.py +0 -21
  159. pixeltable-0.1.2/pixeltable/utils/tf.py +0 -33
  160. pixeltable-0.1.2/pixeltable/utils/video.py +0 -32
  161. pixeltable-0.1.2/pyproject.toml +0 -48
  162. pixeltable-0.1.2/setup.py +0 -47
@@ -0,0 +1,18 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+
6
+ Copyright 2023 Marcel Kornacker
7
+
8
+ Licensed under the Apache License, Version 2.0 (the "License");
9
+ you may not use this file except in compliance with the License.
10
+ You may obtain a copy of the License at
11
+
12
+ http://www.apache.org/licenses/LICENSE-2.0
13
+
14
+ Unless required by applicable law or agreed to in writing, software
15
+ distributed under the License is distributed on an "AS IS" BASIS,
16
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
+ See the License for the specific language governing permissions and
18
+ limitations under the License.
@@ -0,0 +1,119 @@
1
+ Metadata-Version: 2.1
2
+ Name: pixeltable
3
+ Version: 0.2.1
4
+ Summary: Pixeltable: The Multimodal AI Data Plane
5
+ Author: Marcel Kornacker
6
+ Author-email: marcelk@gmail.com
7
+ Requires-Python: >=3.9,<4.0
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.9
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Requires-Dist: av (>=10.0.0)
14
+ Requires-Dist: cloudpickle (>=2.2.1,<3.0.0)
15
+ Requires-Dist: jinja2 (>=3.1.3,<4.0.0)
16
+ Requires-Dist: jmespath (>=1.0.1,<2.0.0)
17
+ Requires-Dist: numpy (>=1.24.1,<2.0.0)
18
+ Requires-Dist: opencv-python-headless (>=4.7.0.68,<5.0.0.0)
19
+ Requires-Dist: pandas (>=1.5.3,<2.0.0)
20
+ Requires-Dist: pgserver (==0.0.5)
21
+ Requires-Dist: pgvector (>=0.2.1,<0.3.0)
22
+ Requires-Dist: pillow (>=9.4.0,<10.0.0)
23
+ Requires-Dist: psutil (>=5.9.5,<6.0.0)
24
+ Requires-Dist: psycopg2-binary (>=2.9.5,<3.0.0)
25
+ Requires-Dist: pyyaml (>=6.0.1,<7.0.0)
26
+ Requires-Dist: regex (>=2022.10.31,<2023.0.0)
27
+ Requires-Dist: sqlalchemy-utils (>=0.41.1,<0.42.0)
28
+ Requires-Dist: sqlalchemy[mypy] (>=2.0.23,<3.0.0)
29
+ Requires-Dist: tqdm (>=4.64.1,<5.0.0)
30
+ Description-Content-Type: text/markdown
31
+
32
+ <img src="docs/pixeltable-banner.png" width="45%"/>
33
+
34
+ # Pixeltable: The Multimodal AI Data Plane
35
+
36
+ [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
37
+ &nbsp;&nbsp;
38
+ ![pytest status](https://github.com/pixeltable/pixeltable/actions/workflows/pytest.yml/badge.svg)
39
+
40
+ Pixeltable is a Python library that lets AI engineers and data scientists focus on
41
+ exploration, modeling, and app development without having to deal with the customary
42
+ data plumbing.
43
+
44
+ **Pixeltable redefines data infrastructure and workflow orchestration for AI development.**
45
+ It brings together data storage, versioning, and indexing with orchestration and model
46
+ versioning under a declarative table interface, with transformations, model inference,
47
+ and custom logic represented as computed columns.
48
+ <!--
49
+ ## Quick Start
50
+
51
+ If you just want to play around with Pixeltable to see what it's capable of, the easiest way is to run
52
+ the Pixeltable Basics tutorial in Colab:
53
+
54
+ <a target="_blank" href="https://colab.research.google.com/github/pixeltable/pixeltable/blob/docs/tutorials/pixeltable-basics.ipynb">
55
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
56
+ </a>
57
+ -->
58
+ ## Installation
59
+
60
+ Pixeltable works with Python 3.9, 3.10, or 3.11 running on Linux or macOS.
61
+
62
+ ```
63
+ pip install pixeltable
64
+ ```
65
+
66
+ To verify that it's working:
67
+
68
+ ```
69
+ import pixeltable as pxt
70
+ cl = pxt.Client()
71
+ ```
72
+
73
+ For more detailed installation instructions, see the
74
+ [Getting Started with Pixeltable](https://pixeltable.github.io/pixeltable/getting-started/)
75
+ guide. Then, check out the
76
+ [Pixeltable Basics](https://pixeltable.github.io/pixeltable/tutorials/pixeltable-basics/)
77
+ tutorial for a tour of its most important features.
78
+
79
+ ## What problems does Pixeltable solve?
80
+
81
+ Today’s solutions for AI app development require extensive custom coding and infrastructure
82
+ plumbing. Tracking lineage and versions between and across data transformations, models, and
83
+ deployment is cumbersome. Pixeltable is a replacement for traditional data plumbing, providing
84
+ a unified plane for data, models, and orchestration. It removes the data plumbing overhead in
85
+ building and productionizing AI applications.
86
+
87
+ ## Why should you use Pixeltable?
88
+
89
+ - It gives you transparency and reproducibility
90
+ - All generated data is automatically recorded and versioned
91
+ - You will never need to re-run a workload because you lost track of the input data
92
+ - It saves you money
93
+ - All data changes are automatically incremental
94
+ - You never need to re-run pipelines from scratch because you’re adding data
95
+ - It integrates with any existing Python code or libraries
96
+ - Bring your ever-changing code and workloads
97
+ - You choose the models, tools, and AI practices (e.g., your embedding model for a vector index); Pixeltable orchestrates the data
98
+
99
+ ## Example Use Cases
100
+
101
+ * Interact with video data at the frame level without having to think about frame extraction,
102
+ intermediate file storage, or storage space explosion.
103
+ * Augment your data incrementally and interactively with built-in functions and UDFs, such as
104
+ image transformations, model inference, and visualizations, without having to think about data pipelines,
105
+ incremental updates, or capturing function output.
106
+ * Interact with all the data relevant to your AI application (video, images, documents, audio, structured data, JSON) through
107
+ a simple dataframe-style API directly in Python. This includes:
108
+ * similarity search on embeddings, supported by high-dimensional vector indexing
109
+ * path expressions and transformations on JSON data
110
+ * PIL and OpenCV image operations
111
+ * assembling frames into videos
112
+ * Perform keyword and image similarity search at the video frame level without having to worry about frame
113
+ storage.
114
+ * Access all Pixeltable-resident data directly as a PyTorch dataset in your training scripts.
115
+ * Understand the compute and storage costs of your data at the granularity of individual augmentations and
116
+ get cost projections before adding new data and new augmentations.
117
+ * Rely on Pixeltable's automatic versioning and snapshot functionality to protect against regressions
118
+ and to ensure reproducibility.
119
+
@@ -0,0 +1,87 @@
1
+ <img src="docs/pixeltable-banner.png" width="45%"/>
2
+
3
+ # Pixeltable: The Multimodal AI Data Plane
4
+
5
+ [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
6
+ &nbsp;&nbsp;
7
+ ![pytest status](https://github.com/pixeltable/pixeltable/actions/workflows/pytest.yml/badge.svg)
8
+
9
+ Pixeltable is a Python library that lets AI engineers and data scientists focus on
10
+ exploration, modeling, and app development without having to deal with the customary
11
+ data plumbing.
12
+
13
+ **Pixeltable redefines data infrastructure and workflow orchestration for AI development.**
14
+ It brings together data storage, versioning, and indexing with orchestration and model
15
+ versioning under a declarative table interface, with transformations, model inference,
16
+ and custom logic represented as computed columns.
17
+ <!--
18
+ ## Quick Start
19
+
20
+ If you just want to play around with Pixeltable to see what it's capable of, the easiest way is to run
21
+ the Pixeltable Basics tutorial in Colab:
22
+
23
+ <a target="_blank" href="https://colab.research.google.com/github/pixeltable/pixeltable/blob/docs/tutorials/pixeltable-basics.ipynb">
24
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
25
+ </a>
26
+ -->
27
+ ## Installation
28
+
29
+ Pixeltable works with Python 3.9, 3.10, or 3.11 running on Linux or macOS.
30
+
31
+ ```
32
+ pip install pixeltable
33
+ ```
34
+
35
+ To verify that it's working:
36
+
37
+ ```
38
+ import pixeltable as pxt
39
+ cl = pxt.Client()
40
+ ```
41
+
42
+ For more detailed installation instructions, see the
43
+ [Getting Started with Pixeltable](https://pixeltable.github.io/pixeltable/getting-started/)
44
+ guide. Then, check out the
45
+ [Pixeltable Basics](https://pixeltable.github.io/pixeltable/tutorials/pixeltable-basics/)
46
+ tutorial for a tour of its most important features.
47
+
48
+ ## What problems does Pixeltable solve?
49
+
50
+ Today’s solutions for AI app development require extensive custom coding and infrastructure
51
+ plumbing. Tracking lineage and versions between and across data transformations, models, and
52
+ deployment is cumbersome. Pixeltable is a replacement for traditional data plumbing, providing
53
+ a unified plane for data, models, and orchestration. It removes the data plumbing overhead in
54
+ building and productionizing AI applications.
55
+
56
+ ## Why should you use Pixeltable?
57
+
58
+ - It gives you transparency and reproducibility
59
+ - All generated data is automatically recorded and versioned
60
+ - You will never need to re-run a workload because you lost track of the input data
61
+ - It saves you money
62
+ - All data changes are automatically incremental
63
+ - You never need to re-run pipelines from scratch because you’re adding data
64
+ - It integrates with any existing Python code or libraries
65
+ - Bring your ever-changing code and workloads
66
+ - You choose the models, tools, and AI practices (e.g., your embedding model for a vector index); Pixeltable orchestrates the data
67
+
68
+ ## Example Use Cases
69
+
70
+ * Interact with video data at the frame level without having to think about frame extraction,
71
+ intermediate file storage, or storage space explosion.
72
+ * Augment your data incrementally and interactively with built-in functions and UDFs, such as
73
+ image transformations, model inference, and visualizations, without having to think about data pipelines,
74
+ incremental updates, or capturing function output.
75
+ * Interact with all the data relevant to your AI application (video, images, documents, audio, structured data, JSON) through
76
+ a simple dataframe-style API directly in Python. This includes:
77
+ * similarity search on embeddings, supported by high-dimensional vector indexing
78
+ * path expressions and transformations on JSON data
79
+ * PIL and OpenCV image operations
80
+ * assembling frames into videos
81
+ * Perform keyword and image similarity search at the video frame level without having to worry about frame
82
+ storage.
83
+ * Access all Pixeltable-resident data directly as a PyTorch dataset in your training scripts.
84
+ * Understand the compute and storage costs of your data at the granularity of individual augmentations and
85
+ get cost projections before adding new data and new augmentations.
86
+ * Rely on Pixeltable's automatic versioning and snapshot functionality to protect against regressions
87
+ and to ensure reproducibility.
@@ -0,0 +1,44 @@
1
# Public API of the pixeltable package: re-exports the user-facing classes, type constructors and
# decorators from the submodules so that they are available as `pixeltable.<name>`.
from .catalog import Column, Table, InsertableTable, View
from .client import Client
from .dataframe import DataFrame
from .exceptions import Error
from .exprs import RELATIVE_PATH_ROOT
from .func import Function, udf, uda, Aggregator, expr_udf
from .type_system import \
    ColumnType, StringType, IntType, FloatType, BoolType, TimestampType, JsonType, ArrayType, ImageType, VideoType, \
    AudioType, DocumentType
from .utils.help import help
# imported for its side effect of making `pixeltable.functions` available as an attribute of the package
# noinspection PyUnresolvedReferences
from . import functions

# names exported by `from pixeltable import *`
__all__ = [
    'Client',
    'DataFrame',
    'Column',
    'Table',
    'InsertableTable',
    'View',
    'Error',
    'ColumnType',
    'StringType',
    'IntType',
    'FloatType',
    'BoolType',
    'TimestampType',
    'JsonType',
    'RELATIVE_PATH_ROOT',
    'ArrayType',
    'ImageType',
    'VideoType',
    'AudioType',
    'DocumentType',
    'Function',
    'help',
    'udf',
    'Aggregator',
    'uda',
    'expr_udf',
]
42
+
43
+
44
+
@@ -0,0 +1,13 @@
1
+ from .catalog import Catalog
2
+ from .column import Column
3
+ from .table_version_path import TableVersionPath
4
+ from .table_version import TableVersion
5
+ from .schema_object import SchemaObject
6
+ from .named_function import NamedFunction
7
+ from .dir import Dir
8
+ from .table import Table
9
+ from .insertable_table import InsertableTable
10
+ from .view import View
11
+ from .path import Path
12
+ from .path_dict import PathDict
13
+ from .globals import is_valid_identifier, is_valid_path
@@ -0,0 +1,159 @@
1
+ from __future__ import annotations
2
+ from typing import Optional, List, Any, Dict, Tuple
3
+ from uuid import UUID
4
+ import dataclasses
5
+ import logging
6
+
7
+ import sqlalchemy as sql
8
+ import sqlalchemy.orm as orm
9
+
10
+ from .table_version import TableVersion
11
+ from .table_version_path import TableVersionPath
12
+ from .table import Table
13
+ from .named_function import NamedFunction
14
+ from .path_dict import PathDict
15
+ import pixeltable.env as env
16
+ import pixeltable.metadata.schema as schema
17
+
18
+ _logger = logging.getLogger('pixeltable')
19
+
20
class Catalog:
    """A repository of catalog objects.

    Used as a singleton: :meth:`get` creates the instance on first use and loads all tables and views
    from the metadata store; :meth:`clear` drops the instance again.
    """
    _instance: Optional[Catalog] = None

    @classmethod
    def get(cls) -> Catalog:
        """Return the singleton instance, creating it and loading the stored tables/views on first call."""
        if cls._instance is None:
            # NOTE(review): the instance is assigned before the tables are loaded; presumably code running
            # during the load calls Catalog.get() and must see this instance -- confirm before reordering
            cls._instance = cls()
            with orm.Session(env.Env.get().engine, future=True) as session:
                cls._instance._load_table_versions(session)
                #cls._instance._load_functions(session)
        return cls._instance

    @classmethod
    def clear(cls) -> None:
        """Remove the instance. Used for testing."""
        cls._instance = None

    def __init__(self) -> None:
        # key: [id, version]
        # - mutable version of a table: version == None (even though TableVersion.version is set correctly)
        # - snapshot versions: records the version of the snapshot
        # NOTE(review): this dict is only read, never written, in this class; presumably TableVersion
        # registers itself here on construction -- confirm
        self.tbl_versions: Dict[Tuple[UUID, Optional[int]], TableVersion] = {}

        self.tbls: Dict[UUID, Table] = {}  # don't use a defaultdict here, it doesn't cooperate with the debugger
        self.tbl_dependents: Dict[UUID, List[Table]] = {}

        self._init_store()
        self.paths = PathDict()  # do this after _init_store()

    def _init_store(self) -> None:
        """One-time initialization of the stored catalog. Idempotent.

        Creates the top-level directory record (with an empty name) unless at least one directory record
        already exists.
        """
        with orm.Session(env.Env.get().engine, future=True) as session:
            if session.query(sql.func.count(schema.Dir.id)).scalar() > 0:
                return
            # create a top-level directory, so that every schema object has a directory
            dir_md = schema.DirMd(name='')
            dir_record = schema.Dir(parent_id=None, md=dataclasses.asdict(dir_md))
            session.add(dir_record)
            session.flush()
            session.commit()
            _logger.info('Initialized catalog')

    def _load_snapshot_version(
            self, tbl_id: UUID, version: int, base: Optional[TableVersion], session: orm.Session
    ) -> TableVersion:
        """Load a specific version of a table as a snapshot TableVersion.

        Args:
            tbl_id: id of the table to load
            version: version number to load
            base: TableVersion to pass as the ``base`` of the loaded version (None if there is no base)
            session: session used to query the metadata store

        Returns:
            A TableVersion for (tbl_id, version), created with ``is_snapshot=True``.

        Raises:
            sqlalchemy.exc.NoResultFound / MultipleResultsFound: via ``Query.one()``, if the store doesn't
                contain exactly one matching record.
        """
        # look up the table record together with the schema version that was current at 'version'
        q = session.query(schema.Table, schema.TableSchemaVersion) \
            .select_from(schema.Table) \
            .join(schema.TableVersion) \
            .join(schema.TableSchemaVersion) \
            .where(schema.Table.id == tbl_id) \
            .where(sql.text(f"({schema.TableVersion.__table__}.md->>'version')::int = {version}")) \
            .where(sql.text((
                f"({schema.TableVersion.__table__}.md->>'schema_version')::int = "
                f"{schema.TableSchemaVersion.__table__}.{schema.TableSchemaVersion.schema_version.name}")))
        tbl_record, schema_version_record = q.one()
        tbl_md = schema.md_from_dict(schema.TableMd, tbl_record.md)
        schema_version_md = schema.md_from_dict(schema.TableSchemaVersionMd, schema_version_record.md)
        # we ignore tbl_record.base_tbl_id/base_snapshot_id and use 'base' instead: if the base is a snapshot
        # we'd have to look that up first
        return TableVersion(tbl_record.id, tbl_md, version, schema_version_md, is_snapshot=True, base=base)

    def _load_table_versions(self, session: orm.Session) -> None:
        """Load all tables and views from the metadata store.

        For every stored table this creates an InsertableTable or a View (depending on whether the table
        metadata contains view metadata) and registers it in ``self.tbls``, ``self.tbl_dependents`` and
        ``self.paths``.
        """
        from .insertable_table import InsertableTable
        from .view import View

        # load tables/views;
        # do this in ascending order of creation ts so that we can resolve base references in one pass
        # (the join is restricted to each table's version-0 record, whose created_at is the sort key)
        q = session.query(schema.Table, schema.TableSchemaVersion) \
            .select_from(schema.Table) \
            .join(schema.TableVersion) \
            .join(schema.TableSchemaVersion) \
            .where(sql.text(f"({schema.TableVersion.__table__}.md->>'version')::int = 0")) \
            .where(sql.text((
                f"({schema.Table.__table__}.md->>'current_schema_version')::int = "
                f"{schema.TableSchemaVersion.__table__}.{schema.TableSchemaVersion.schema_version.name}"))) \
            .order_by(sql.text(f"({schema.TableVersion.__table__}.md->>'created_at')::float"))

        for tbl_record, schema_version_record in q.all():
            tbl_md = schema.md_from_dict(schema.TableMd, tbl_record.md)
            schema_version_md = schema.md_from_dict(schema.TableSchemaVersionMd, schema_version_record.md)
            view_md = tbl_md.view_md

            if view_md is not None:
                assert len(view_md.base_versions) > 0
                # construct a TableVersionPath for the view
                refd_versions = [(UUID(tbl_id), version) for tbl_id, version in view_md.base_versions]
                base_path: Optional[TableVersionPath] = None
                base: Optional[TableVersion] = None
                # go through the versions in reverse order, so we can construct TableVersionPaths
                for base_id, version in reversed(refd_versions):
                    base_version = self.tbl_versions.get((base_id, version), None)
                    if base_version is None:
                        # if this is a reference to a mutable table, we should have loaded it already
                        assert version is not None
                        base_version = self._load_snapshot_version(base_id, version, base, session)
                    base_path = TableVersionPath(base_version, base=base_path)
                    base = base_version
                assert base_path is not None

                base_tbl = self.tbls[base_path.tbl_version.id]
                is_snapshot = view_md.is_snapshot
                snapshot_only = is_snapshot and view_md.predicate is None and len(schema_version_md.columns) == 0
                if snapshot_only:
                    # this is a pure snapshot, without a physical table backing it
                    view_path = base_path
                else:
                    tbl_version = TableVersion(
                        tbl_record.id, tbl_md, tbl_md.current_version, schema_version_md, is_snapshot=is_snapshot,
                        base=base_path.tbl_version if is_snapshot else None,
                        base_path=base_path if not is_snapshot else None)
                    view_path = TableVersionPath(tbl_version, base=base_path)

                tbl = View(
                    tbl_record.id, tbl_record.dir_id, tbl_md.name, view_path, base_tbl,
                    snapshot_only=snapshot_only)
                # the base was registered in an earlier iteration, so its dependents list already exists
                self.tbl_dependents[base_tbl._id].append(tbl)

            else:
                tbl_version = TableVersion(tbl_record.id, tbl_md, tbl_md.current_version, schema_version_md)
                tbl = InsertableTable(tbl_record.dir_id, tbl_version)

            self.tbls[tbl._id] = tbl
            self.tbl_dependents[tbl._id] = []
            self.paths.add_schema_obj(tbl._dir_id, tbl_md.name, tbl)

    # def _load_functions(self, session: orm.Session) -> None:
    #     # load Function metadata; doesn't load the actual callable, which can be large and is only done on-demand by the
    #     # FunctionRegistry
    #     q = session.query(schema.Function.id, schema.Function.dir_id, schema.Function.md) \
    #         .where(sql.text(f"({schema.Function.__table__}.md->>'name')::text IS NOT NULL"))
    #     for id, dir_id, md in q.all():
    #         assert 'name' in md
    #         name = md['name']
    #         assert name is not None
    #         named_fn = NamedFunction(id, dir_id, name)
    #         self.paths.add_schema_obj(dir_id, name, named_fn)
@@ -0,0 +1,200 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from typing import Optional, Union, Callable, Set
5
+
6
+ import sqlalchemy as sql
7
+ from pgvector.sqlalchemy import Vector
8
+
9
+ from pixeltable import exceptions as excs
10
+ from pixeltable.metadata import schema
11
+ from pixeltable.type_system import ColumnType, StringType
12
+ from .globals import is_valid_identifier
13
+
14
+ _logger = logging.getLogger('pixeltable')
15
+
16
class Column:
    """Representation of a column in the schema of a Table/DataFrame.

    A Column contains all the metadata necessary for executing queries and updates against a particular version of a
    table/view.

    Equality and hashing are based on the pair (owning table id, column id), so both require that the column
    has already been assigned to a TableVersion (``tbl`` is set).
    """

    def __init__(
            self, name: str, col_type: Optional[ColumnType] = None,
            computed_with: Optional[Union['Expr', Callable]] = None,
            primary_key: bool = False, stored: Optional[bool] = None,
            indexed: bool = False,
            # these parameters aren't set by users
            col_id: Optional[int] = None):
        """Column constructor.

        Args:
            name: column name
            col_type: column type; can be None if the type can be derived from ``computed_with``
            computed_with: a callable or an Expr object that computes the column value
            primary_key: if True, this column is part of the primary key
            stored: determines whether a computed column is present in the stored table or recomputed on demand
            indexed: if True, this column has a nearest neighbor index (only valid for image columns)
            col_id: column ID (only used internally)

        Raises:
            excs.Error: if ``name`` is not a valid identifier; if neither ``col_type`` nor ``computed_with`` is
                given; if ``computed_with`` is neither convertible to an Expr nor callable; if ``computed_with``
                is a callable and ``col_type`` is missing; if ``indexed`` is True for a non-image column.

        Computed columns: those have a non-None ``computed_with`` argument

        - when constructed by the user: ``computed_with`` was constructed explicitly and is passed in;
          col_type is None
        - when loaded from md store: ``computed_with`` is set and col_type is set

        ``computed_with`` is a Callable:

        - the callable's parameter names must correspond to existing columns in the table for which this Column
          is being used
        - ``col_type`` needs to be set to the callable's return type

        ``stored`` (only valid for computed image columns):

        - if True: the column is present in the stored table
        - if False: the column is not present in the stored table and recomputed during a query
        - if None: the system chooses for you (at present, this is always False, but this may change in the future)

        indexed: only valid for image columns; if true, maintains an NN index for this column
        """
        if not is_valid_identifier(name):
            raise excs.Error(f"Invalid column name: '{name}'")
        self.name = name
        if col_type is None and computed_with is None:
            raise excs.Error(f'Column {name}: col_type is required if computed_with is not specified')

        self.value_expr: Optional['Expr'] = None
        self.compute_func: Optional[Callable] = None
        # imported here rather than at module level (presumably to avoid a circular import; verify)
        from pixeltable import exprs
        if computed_with is not None:
            value_expr = exprs.Expr.from_object(computed_with)
            if value_expr is None:
                # computed_with needs to be a Callable
                if not callable(computed_with):
                    raise excs.Error(
                        f'Column {name}: computed_with needs to be either a Pixeltable expression or a Callable, '
                        f'but it is a {type(computed_with)}')
                if col_type is None:
                    raise excs.Error(f'Column {name}: col_type is required if computed_with is a Callable')
                # we need to turn the computed_with function into an Expr, but this requires resolving
                # column name references and for that we need to wait until we're assigned to a Table
                self.compute_func = computed_with
            else:
                self.value_expr = value_expr.copy()
                self.col_type = self.value_expr.col_type

        # an explicitly supplied col_type takes precedence over the type derived from the value expr
        if col_type is not None:
            self.col_type = col_type
        assert self.col_type is not None

        self.stored = stored
        self.dependent_cols: Set[Column] = set()  # cols with value_exprs that reference us; set by TableVersion
        self.id = col_id
        self.primary_key = primary_key

        # column in the stored table for the values of this Column
        self.sa_col: Optional[sql.schema.Column] = None

        # computed cols also have storage columns for the exception string and type
        self.sa_errormsg_col: Optional[sql.schema.Column] = None
        self.sa_errortype_col: Optional[sql.schema.Column] = None
        # indexed columns also have a column for the embeddings
        self.sa_idx_col: Optional[sql.schema.Column] = None
        from .table_version import TableVersion
        self.tbl: Optional[TableVersion] = None  # set by owning TableVersion

        if indexed and not self.col_type.is_image_type():
            raise excs.Error(f'Column {name}: indexed=True requires ImageType')
        self.is_indexed = indexed

    @classmethod
    def from_md(cls, col_id: int, md: schema.SchemaColumn, tbl: 'TableVersion') -> Column:
        """Construct a Column from metadata.

        Leaves out value_expr, because that requires TableVersion.cols to be complete.

        Args:
            col_id: ID to assign to the new column
            md: stored column metadata (name, type, pk/stored/indexed flags)
            tbl: the TableVersion that owns the column

        Returns:
            the new Column, with ``tbl`` already set
        """
        col = cls(
            md.name, col_type=ColumnType.from_dict(md.col_type), primary_key=md.is_pk,
            stored=md.stored, indexed=md.is_indexed, col_id=col_id)
        col.tbl = tbl
        return col

    def __hash__(self) -> int:
        """Hash on (table id, column id), consistent with ``__eq__``; requires ``tbl`` to be set."""
        assert self.tbl is not None
        return hash((self.tbl.id, self.id))

    def check_value_expr(self) -> None:
        """Validate the value expr against the ``stored`` setting.

        Raises:
            excs.Error: if the column is explicitly unstored (``stored=False``) but is computed with a
                window function.
        """
        assert self.value_expr is not None
        # only an explicit stored=False is rejected; stored=None leaves the choice to the system
        if self.stored is False and self.is_computed and self.has_window_fn_call():
            raise excs.Error(
                f'Column {self.name}: stored={self.stored} not supported for columns computed with window functions:'
                f'\n{self.value_expr}')

    def has_window_fn_call(self) -> bool:
        """Return True if the value expr contains at least one window function call."""
        if self.value_expr is None:
            return False
        from pixeltable import exprs
        window_calls = self.value_expr.subexprs(
            filter=lambda e: isinstance(e, exprs.FunctionCall) and e.is_window_fn_call)
        # we only need to know whether there is a match, not how many
        return any(True for _ in window_calls)

    @property
    def is_computed(self) -> bool:
        """True if the column has either a compute function or a value expr."""
        return self.compute_func is not None or self.value_expr is not None

    @property
    def is_stored(self) -> bool:
        """Returns True if column is materialized in the stored table."""
        assert self.stored is not None
        return self.stored

    @property
    def records_errors(self) -> bool:
        """True if this column also stores error information."""
        return self.is_stored and (self.is_computed or self.col_type.is_media_type())

    def source(self) -> None:
        """
        If this is a computed col and the top-level expr is a function call, print the source, if possible.
        """
        from pixeltable import exprs
        if self.value_expr is None or not isinstance(self.value_expr, exprs.FunctionCall):
            return
        self.value_expr.fn.source()

    def create_sa_cols(self) -> None:
        """
        These need to be recreated for every new table schema version.

        Creates the value column and, where applicable, the error message/type columns and the index column.
        Must only be called for stored columns.
        """
        assert self.is_stored
        # all storage columns are nullable (we deal with null errors in Pixeltable directly)
        self.sa_col = sql.Column(self.storage_name(), self.col_type.to_sa_type(), nullable=True)
        # use the same condition that callers see through records_errors
        if self.records_errors:
            self.sa_errormsg_col = sql.Column(self.errormsg_storage_name(), StringType().to_sa_type(), nullable=True)
            self.sa_errortype_col = sql.Column(self.errortype_storage_name(), StringType().to_sa_type(), nullable=True)
        if self.is_indexed:
            self.sa_idx_col = sql.Column(self.index_storage_name(), Vector(512), nullable=True)

    def storage_name(self) -> str:
        """Name of the value column in the stored table (``col_<id>``); requires an id and a stored column."""
        assert self.id is not None
        assert self.is_stored
        return f'col_{self.id}'

    def errormsg_storage_name(self) -> str:
        """Name of the storage column holding the error message."""
        return f'{self.storage_name()}_errormsg'

    def errortype_storage_name(self) -> str:
        """Name of the storage column holding the error type."""
        return f'{self.storage_name()}_errortype'

    def index_storage_name(self) -> str:
        """Name of the storage column holding the index data (embeddings)."""
        return f'{self.storage_name()}_idx_0'

    def __str__(self) -> str:
        return f'{self.name}: {self.col_type}'

    def __eq__(self, other: object) -> bool:
        """Two Columns are equal if they have the same table id and column id; both must have ``tbl`` set."""
        if not isinstance(other, Column):
            return False
        assert self.tbl is not None
        assert other.tbl is not None
        return self.tbl.id == other.tbl.id and self.id == other.id
+
@@ -0,0 +1,32 @@
1
+ from __future__ import annotations
2
+
3
+ import dataclasses
4
+ import logging
5
+ from uuid import UUID
6
+
7
+ import sqlalchemy as sql
8
+
9
+ from .schema_object import SchemaObject
10
+ from pixeltable.env import Env
11
+ from pixeltable.metadata import schema
12
+
13
+
14
+ _logger = logging.getLogger('pixeltable')
15
+
16
class Dir(SchemaObject):
    """Schema object representing a directory."""

    def __init__(self, id: UUID, parent_id: UUID, name: str):
        """Create a directory object.

        Args:
            id: ID of this directory
            parent_id: ID of the parent directory; passed to SchemaObject as its third argument
            name: directory name
        """
        super().__init__(id, name, parent_id)

    @classmethod
    def display_name(cls) -> str:
        """Return the user-facing name of this kind of schema object."""
        return 'directory'

    def move(self, new_name: str, new_dir_id: UUID) -> None:
        """Rename/relocate this directory and write the change to the Dir metadata table.

        Args:
            new_name: new name of the directory
            new_dir_id: ID of the new parent directory
        """
        # update the in-memory object first; the record update below reads self._dir_id
        # (presumably set to new_dir_id by SchemaObject.move -- verify against the base class)
        super().move(new_name, new_dir_id)
        with Env.get().engine.begin() as conn:
            md_dict = dataclasses.asdict(schema.DirMd(name=new_name))
            updated_values = {schema.Dir.parent_id: self._dir_id, schema.Dir.md: md_dict}
            stmt = sql.update(schema.Dir.__table__).values(updated_values).where(schema.Dir.id == self._id)
            conn.execute(stmt)
32
+