pixeltable 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +2 -3
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +2 -1
- pixeltable/catalog/catalog.py +63 -36
- pixeltable/catalog/column.py +11 -4
- pixeltable/catalog/dir.py +5 -5
- pixeltable/catalog/globals.py +28 -14
- pixeltable/catalog/insertable_table.py +81 -43
- pixeltable/catalog/path.py +2 -2
- pixeltable/catalog/table.py +140 -109
- pixeltable/catalog/table_version.py +60 -43
- pixeltable/catalog/table_version_handle.py +3 -0
- pixeltable/catalog/table_version_path.py +1 -1
- pixeltable/catalog/view.py +17 -9
- pixeltable/dataframe.py +5 -3
- pixeltable/env.py +109 -43
- pixeltable/exec/__init__.py +2 -0
- pixeltable/exec/aggregation_node.py +6 -8
- pixeltable/exec/cache_prefetch_node.py +4 -7
- pixeltable/exec/component_iteration_node.py +1 -3
- pixeltable/exec/data_row_batch.py +1 -2
- pixeltable/exec/exec_context.py +1 -1
- pixeltable/exec/exec_node.py +2 -3
- pixeltable/exec/expr_eval/__init__.py +2 -0
- pixeltable/exec/expr_eval/evaluators.py +137 -20
- pixeltable/exec/expr_eval/expr_eval_node.py +43 -64
- pixeltable/exec/expr_eval/globals.py +68 -7
- pixeltable/exec/expr_eval/schedulers.py +25 -23
- pixeltable/exec/in_memory_data_node.py +8 -6
- pixeltable/exec/row_update_node.py +3 -4
- pixeltable/exec/sql_node.py +16 -17
- pixeltable/exprs/__init__.py +3 -2
- pixeltable/exprs/arithmetic_expr.py +2 -0
- pixeltable/exprs/column_property_ref.py +1 -1
- pixeltable/exprs/column_ref.py +39 -3
- pixeltable/exprs/compound_predicate.py +1 -1
- pixeltable/exprs/data_row.py +17 -1
- pixeltable/exprs/expr.py +51 -21
- pixeltable/exprs/function_call.py +34 -2
- pixeltable/exprs/globals.py +12 -0
- pixeltable/exprs/json_mapper.py +95 -48
- pixeltable/exprs/json_path.py +3 -10
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +33 -6
- pixeltable/exprs/similarity_expr.py +6 -21
- pixeltable/exprs/sql_element_cache.py +1 -1
- pixeltable/exprs/string_op.py +107 -0
- pixeltable/ext/__init__.py +1 -1
- pixeltable/ext/functions/__init__.py +1 -1
- pixeltable/ext/functions/whisperx.py +1 -1
- pixeltable/ext/functions/yolox.py +22 -65
- pixeltable/func/aggregate_function.py +1 -1
- pixeltable/func/callable_function.py +2 -5
- pixeltable/func/expr_template_function.py +22 -2
- pixeltable/func/function.py +4 -5
- pixeltable/func/function_registry.py +1 -1
- pixeltable/func/signature.py +1 -1
- pixeltable/func/tools.py +2 -2
- pixeltable/func/udf.py +2 -2
- pixeltable/functions/__init__.py +2 -2
- pixeltable/functions/anthropic.py +2 -2
- pixeltable/functions/audio.py +1 -1
- pixeltable/functions/deepseek.py +1 -1
- pixeltable/functions/fireworks.py +1 -1
- pixeltable/functions/globals.py +22 -11
- pixeltable/functions/huggingface.py +1 -1
- pixeltable/functions/image.py +1 -1
- pixeltable/functions/json.py +1 -1
- pixeltable/functions/llama_cpp.py +1 -1
- pixeltable/functions/math.py +1 -1
- pixeltable/functions/mistralai.py +1 -1
- pixeltable/functions/ollama.py +1 -1
- pixeltable/functions/openai.py +2 -2
- pixeltable/functions/replicate.py +1 -1
- pixeltable/functions/string.py +1 -1
- pixeltable/functions/timestamp.py +1 -1
- pixeltable/functions/together.py +1 -1
- pixeltable/functions/util.py +1 -1
- pixeltable/functions/video.py +2 -2
- pixeltable/functions/vision.py +2 -2
- pixeltable/globals.py +85 -33
- pixeltable/index/embedding_index.py +12 -1
- pixeltable/io/__init__.py +8 -5
- pixeltable/io/datarows.py +138 -0
- pixeltable/io/external_store.py +8 -5
- pixeltable/io/fiftyone.py +6 -7
- pixeltable/io/globals.py +7 -160
- pixeltable/io/hf_datasets.py +21 -98
- pixeltable/io/label_studio.py +21 -20
- pixeltable/io/pandas.py +35 -48
- pixeltable/io/parquet.py +17 -42
- pixeltable/io/table_data_conduit.py +569 -0
- pixeltable/io/utils.py +6 -21
- pixeltable/iterators/__init__.py +1 -1
- pixeltable/metadata/__init__.py +6 -4
- pixeltable/metadata/converters/convert_24.py +3 -3
- pixeltable/metadata/converters/convert_25.py +1 -1
- pixeltable/metadata/converters/convert_29.py +1 -1
- pixeltable/metadata/converters/convert_30.py +50 -0
- pixeltable/metadata/converters/util.py +26 -1
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +3 -0
- pixeltable/store.py +2 -2
- pixeltable/type_system.py +19 -7
- pixeltable/utils/arrow.py +32 -7
- pixeltable/utils/console_output.py +3 -2
- pixeltable/utils/coroutine.py +3 -3
- pixeltable/utils/dbms.py +66 -0
- pixeltable/utils/documents.py +61 -67
- pixeltable/utils/filecache.py +1 -1
- pixeltable/utils/http_server.py +3 -2
- pixeltable/utils/pytorch.py +1 -1
- pixeltable/utils/sql.py +1 -1
- pixeltable-0.3.11.dist-info/METADATA +436 -0
- pixeltable-0.3.11.dist-info/RECORD +179 -0
- {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/WHEEL +1 -1
- pixeltable/catalog/path_dict.py +0 -169
- pixeltable-0.3.9.dist-info/METADATA +0 -382
- pixeltable-0.3.9.dist-info/RECORD +0 -175
- {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/entry_points.txt +0 -0
|
@@ -80,7 +80,7 @@ def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], A
|
|
|
80
80
|
rolled_kwargs = kwargs.pop(param['name'])
|
|
81
81
|
|
|
82
82
|
if rolled_args is not None:
|
|
83
|
-
assert rolled_args['_classname'] in
|
|
83
|
+
assert rolled_args['_classname'] in ('InlineArray', 'InlineList')
|
|
84
84
|
new_args.extend(rolled_args['components'])
|
|
85
85
|
if rolled_kwargs is not None:
|
|
86
86
|
assert rolled_kwargs['_classname'] == 'InlineDict'
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
|
|
3
|
+
import sqlalchemy as sql
|
|
4
|
+
|
|
5
|
+
from pixeltable.metadata import register_converter
|
|
6
|
+
from pixeltable.metadata.converters.util import (
|
|
7
|
+
convert_table_record,
|
|
8
|
+
convert_table_schema_version_record,
|
|
9
|
+
convert_table_version_record,
|
|
10
|
+
)
|
|
11
|
+
from pixeltable.metadata.schema import Table, TableSchemaVersion, TableVersion
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@register_converter(version=30)
|
|
15
|
+
def _(engine: sql.engine.Engine) -> None:
|
|
16
|
+
convert_table_record(engine, table_record_updater=__update_table_record)
|
|
17
|
+
convert_table_version_record(engine, table_version_record_updater=__update_table_version_record)
|
|
18
|
+
convert_table_schema_version_record(
|
|
19
|
+
engine, table_schema_version_record_updater=__update_table_schema_version_record
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def __update_table_record(record: Table) -> None:
|
|
24
|
+
"""
|
|
25
|
+
Update TableMd with table_id
|
|
26
|
+
"""
|
|
27
|
+
assert isinstance(record.md, dict)
|
|
28
|
+
md = copy.copy(record.md)
|
|
29
|
+
md['tbl_id'] = str(record.id)
|
|
30
|
+
record.md = md
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def __update_table_version_record(record: TableVersion) -> None:
|
|
34
|
+
"""
|
|
35
|
+
Update TableVersion with table_id.
|
|
36
|
+
"""
|
|
37
|
+
assert isinstance(record.md, dict)
|
|
38
|
+
md = copy.copy(record.md)
|
|
39
|
+
md['tbl_id'] = str(record.tbl_id)
|
|
40
|
+
record.md = md
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def __update_table_schema_version_record(record: TableSchemaVersion) -> None:
|
|
44
|
+
"""
|
|
45
|
+
Update TableSchemaVersion with table_id.
|
|
46
|
+
"""
|
|
47
|
+
assert isinstance(record.md, dict)
|
|
48
|
+
md = copy.copy(record.md)
|
|
49
|
+
md['tbl_id'] = str(record.tbl_id)
|
|
50
|
+
record.md = md
|
|
@@ -5,7 +5,7 @@ from uuid import UUID
|
|
|
5
5
|
|
|
6
6
|
import sqlalchemy as sql
|
|
7
7
|
|
|
8
|
-
from pixeltable.metadata.schema import Function, Table, TableSchemaVersion
|
|
8
|
+
from pixeltable.metadata.schema import Function, Table, TableSchemaVersion, TableVersion
|
|
9
9
|
|
|
10
10
|
__logger = logging.getLogger('pixeltable')
|
|
11
11
|
|
|
@@ -143,3 +143,28 @@ def __update_schema_column(table_schema_version_md: dict, schema_column_updater:
|
|
|
143
143
|
assert isinstance(cols, dict)
|
|
144
144
|
for schema_col in cols.values():
|
|
145
145
|
schema_column_updater(schema_col)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def convert_table_record(engine: sql.engine.Engine, table_record_updater: Optional[Callable[[Table], None]]) -> None:
|
|
149
|
+
with sql.orm.Session(engine, future=True) as session:
|
|
150
|
+
for record in session.query(Table).all():
|
|
151
|
+
table_record_updater(record)
|
|
152
|
+
session.commit()
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def convert_table_version_record(
|
|
156
|
+
engine: sql.engine.Engine, table_version_record_updater: Optional[Callable[[TableVersion], None]]
|
|
157
|
+
) -> None:
|
|
158
|
+
with sql.orm.Session(engine, future=True) as session:
|
|
159
|
+
for record in session.query(TableVersion).all():
|
|
160
|
+
table_version_record_updater(record)
|
|
161
|
+
session.commit()
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def convert_table_schema_version_record(
|
|
165
|
+
engine: sql.engine.Engine, table_schema_version_record_updater: Optional[Callable[[TableSchemaVersion], None]]
|
|
166
|
+
) -> None:
|
|
167
|
+
with sql.orm.Session(engine, future=True) as session:
|
|
168
|
+
for record in session.query(TableSchemaVersion).all():
|
|
169
|
+
table_schema_version_record_updater(record)
|
|
170
|
+
session.commit()
|
pixeltable/metadata/notes.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
# rather than as a comment, so that the existence of a description can be enforced by
|
|
3
3
|
# the unit tests when new versions are added.
|
|
4
4
|
VERSION_NOTES = {
|
|
5
|
+
31: 'Add table ids to metadata structs',
|
|
5
6
|
30: 'Store default values and constant arguments as literals',
|
|
6
7
|
29: 'Add user and additional_md fields to metadata structs',
|
|
7
8
|
28: 'Enable view creation from DataFrame with select clause',
|
pixeltable/metadata/schema.py
CHANGED
|
@@ -153,6 +153,7 @@ class ViewMd:
|
|
|
153
153
|
|
|
154
154
|
@dataclasses.dataclass
|
|
155
155
|
class TableMd:
|
|
156
|
+
tbl_id: str # uuid.UUID
|
|
156
157
|
name: str
|
|
157
158
|
|
|
158
159
|
user: Optional[str]
|
|
@@ -199,6 +200,7 @@ class Table(Base):
|
|
|
199
200
|
|
|
200
201
|
@dataclasses.dataclass
|
|
201
202
|
class TableVersionMd:
|
|
203
|
+
tbl_id: str # uuid.UUID
|
|
202
204
|
created_at: float # time.time()
|
|
203
205
|
version: int
|
|
204
206
|
schema_version: int
|
|
@@ -234,6 +236,7 @@ class TableSchemaVersionMd:
|
|
|
234
236
|
Records all versioned table metadata.
|
|
235
237
|
"""
|
|
236
238
|
|
|
239
|
+
tbl_id: str # uuid.UUID
|
|
237
240
|
schema_version: int
|
|
238
241
|
preceding_schema_version: Optional[int]
|
|
239
242
|
columns: dict[int, SchemaColumn] # col_id -> SchemaColumn
|
pixeltable/store.py
CHANGED
|
@@ -99,9 +99,9 @@ class StoreBase:
|
|
|
99
99
|
|
|
100
100
|
# v_min/v_max indices: speeds up base table scans needed to propagate a base table insert or delete
|
|
101
101
|
idx_name = f'vmin_idx_{self.tbl_version.id.hex}'
|
|
102
|
-
idxs.append(sql.Index(idx_name, self.v_min_col, postgresql_using=
|
|
102
|
+
idxs.append(sql.Index(idx_name, self.v_min_col, postgresql_using=Env.get().dbms.version_index_type))
|
|
103
103
|
idx_name = f'vmax_idx_{self.tbl_version.id.hex}'
|
|
104
|
-
idxs.append(sql.Index(idx_name, self.v_max_col, postgresql_using=
|
|
104
|
+
idxs.append(sql.Index(idx_name, self.v_max_col, postgresql_using=Env.get().dbms.version_index_type))
|
|
105
105
|
|
|
106
106
|
self.sa_tbl = sql.Table(self._storage_name(), self.sa_md, *all_cols, *idxs)
|
|
107
107
|
|
pixeltable/type_system.py
CHANGED
|
@@ -512,7 +512,7 @@ class StringType(ColumnType):
|
|
|
512
512
|
def __init__(self, nullable: bool = False):
|
|
513
513
|
super().__init__(self.Type.STRING, nullable=nullable)
|
|
514
514
|
|
|
515
|
-
def has_supertype(self):
|
|
515
|
+
def has_supertype(self) -> bool:
|
|
516
516
|
return not self.nullable
|
|
517
517
|
|
|
518
518
|
@classmethod
|
|
@@ -602,7 +602,7 @@ class TimestampType(ColumnType):
|
|
|
602
602
|
def __init__(self, nullable: bool = False):
|
|
603
603
|
super().__init__(self.Type.TIMESTAMP, nullable=nullable)
|
|
604
604
|
|
|
605
|
-
def has_supertype(self):
|
|
605
|
+
def has_supertype(self) -> bool:
|
|
606
606
|
return not self.nullable
|
|
607
607
|
|
|
608
608
|
@classmethod
|
|
@@ -768,7 +768,7 @@ class JsonType(ColumnType):
|
|
|
768
768
|
a_type = a.get('type')
|
|
769
769
|
b_type = b.get('type')
|
|
770
770
|
|
|
771
|
-
if a_type in
|
|
771
|
+
if a_type in ('string', 'integer', 'number', 'boolean', 'object', 'array') and a_type == b_type:
|
|
772
772
|
# a and b both have the same type designation, but are not identical. This can happen if
|
|
773
773
|
# (for example) they have validators or other attributes that differ. In this case, we
|
|
774
774
|
# generalize to {'type': t}, where t is their shared type, with no other qualifications.
|
|
@@ -1170,6 +1170,20 @@ class DocumentType(ColumnType):
|
|
|
1170
1170
|
XML = 3
|
|
1171
1171
|
TXT = 4
|
|
1172
1172
|
|
|
1173
|
+
@classmethod
|
|
1174
|
+
def from_extension(cls, ext: str) -> Optional['DocumentType.DocumentFormat']:
|
|
1175
|
+
if ext in ('.htm', '.html'):
|
|
1176
|
+
return cls.HTML
|
|
1177
|
+
if ext == '.md':
|
|
1178
|
+
return cls.MD
|
|
1179
|
+
if ext == '.pdf':
|
|
1180
|
+
return cls.PDF
|
|
1181
|
+
if ext == '.xml':
|
|
1182
|
+
return cls.XML
|
|
1183
|
+
if ext == '.txt':
|
|
1184
|
+
return cls.TXT
|
|
1185
|
+
return None
|
|
1186
|
+
|
|
1173
1187
|
def __init__(self, nullable: bool = False, doc_formats: Optional[str] = None):
|
|
1174
1188
|
super().__init__(self.Type.DOCUMENT, nullable=nullable)
|
|
1175
1189
|
self.doc_formats = doc_formats
|
|
@@ -1203,9 +1217,7 @@ class DocumentType(ColumnType):
|
|
|
1203
1217
|
assert isinstance(val, str)
|
|
1204
1218
|
from pixeltable.utils.documents import get_document_handle
|
|
1205
1219
|
|
|
1206
|
-
|
|
1207
|
-
if dh is None:
|
|
1208
|
-
raise excs.Error(f'Not a recognized document format: {val}')
|
|
1220
|
+
_ = get_document_handle(val)
|
|
1209
1221
|
|
|
1210
1222
|
|
|
1211
1223
|
T = typing.TypeVar('T')
|
|
@@ -1240,7 +1252,7 @@ class _PxtType:
|
|
|
1240
1252
|
`ColumnType`.
|
|
1241
1253
|
"""
|
|
1242
1254
|
|
|
1243
|
-
def __init__(self):
|
|
1255
|
+
def __init__(self) -> None:
|
|
1244
1256
|
raise TypeError(f'Type `{type(self)}` cannot be instantiated.')
|
|
1245
1257
|
|
|
1246
1258
|
@classmethod
|
pixeltable/utils/arrow.py
CHANGED
|
@@ -11,14 +11,19 @@ PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
|
|
|
11
11
|
pa.large_string(): ts.StringType(nullable=True),
|
|
12
12
|
pa.timestamp('us', tz=datetime.timezone.utc): ts.TimestampType(nullable=True),
|
|
13
13
|
pa.bool_(): ts.BoolType(nullable=True),
|
|
14
|
-
pa.uint8(): ts.IntType(nullable=True),
|
|
15
14
|
pa.int8(): ts.IntType(nullable=True),
|
|
16
|
-
pa.
|
|
17
|
-
pa.uint64(): ts.IntType(nullable=True),
|
|
15
|
+
pa.int16(): ts.IntType(nullable=True),
|
|
18
16
|
pa.int32(): ts.IntType(nullable=True),
|
|
19
17
|
pa.int64(): ts.IntType(nullable=True),
|
|
18
|
+
pa.uint8(): ts.IntType(nullable=True),
|
|
19
|
+
pa.uint16(): ts.IntType(nullable=True),
|
|
20
|
+
pa.uint32(): ts.IntType(nullable=True),
|
|
21
|
+
pa.uint64(): ts.IntType(nullable=True),
|
|
20
22
|
pa.float32(): ts.FloatType(nullable=True),
|
|
21
23
|
pa.float64(): ts.FloatType(nullable=True),
|
|
24
|
+
pa.date32(): ts.StringType(nullable=True), # date32 is not supported in pixeltable, use string
|
|
25
|
+
pa.date64(): ts.StringType(nullable=True), # date64 is not supported in pixeltable, use string
|
|
26
|
+
pa.binary(): None, # cannot import binary (inline image)
|
|
22
27
|
}
|
|
23
28
|
|
|
24
29
|
PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
|
|
@@ -43,7 +48,7 @@ def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.C
|
|
|
43
48
|
return ts.TimestampType(nullable=nullable)
|
|
44
49
|
elif arrow_type in PA_TO_PXT_TYPES:
|
|
45
50
|
pt = PA_TO_PXT_TYPES[arrow_type]
|
|
46
|
-
return pt.copy(nullable=nullable)
|
|
51
|
+
return pt.copy(nullable=nullable) if pt is not None else None
|
|
47
52
|
elif isinstance(arrow_type, pa.FixedShapeTensorType):
|
|
48
53
|
dtype = to_pixeltable_type(arrow_type.value_type, nullable)
|
|
49
54
|
if dtype is None:
|
|
@@ -111,6 +116,28 @@ def iter_tuples(batch: Union[pa.Table, pa.RecordBatch]) -> Iterator[dict[str, An
|
|
|
111
116
|
yield {col_name: values[i] for col_name, values in pydict.items()}
|
|
112
117
|
|
|
113
118
|
|
|
119
|
+
def _ar_val_to_pxt_val(val: Any, pxt_type: ts.ColumnType) -> Any:
|
|
120
|
+
"""Convert a value to insertable format"""
|
|
121
|
+
if val is None:
|
|
122
|
+
return None
|
|
123
|
+
if pxt_type.is_float_type():
|
|
124
|
+
return float(val)
|
|
125
|
+
elif pxt_type.is_int_type():
|
|
126
|
+
return int(val)
|
|
127
|
+
elif pxt_type.is_bool_type():
|
|
128
|
+
return bool(val)
|
|
129
|
+
elif pxt_type.is_string_type():
|
|
130
|
+
return str(val)
|
|
131
|
+
elif pxt_type.is_timestamp_type():
|
|
132
|
+
if isinstance(val, str):
|
|
133
|
+
return datetime.datetime.fromisoformat(val)
|
|
134
|
+
if isinstance(val, datetime.datetime):
|
|
135
|
+
return val
|
|
136
|
+
elif pxt_type.is_array_type():
|
|
137
|
+
return pxt_type.create_literal(val)
|
|
138
|
+
raise ValueError(f'Unsupported type {pxt_type} for value {val}')
|
|
139
|
+
|
|
140
|
+
|
|
114
141
|
def iter_tuples2(
|
|
115
142
|
batch: Union[pa.Table, pa.RecordBatch], col_mapping: Optional[dict[str, str]], schema: dict[str, ts.ColumnType]
|
|
116
143
|
) -> Iterator[dict[str, Any]]:
|
|
@@ -124,8 +151,6 @@ def iter_tuples2(
|
|
|
124
151
|
for i in range(batch_size):
|
|
125
152
|
# Convert a row to insertable format
|
|
126
153
|
yield {
|
|
127
|
-
(pxt_name := col_name
|
|
128
|
-
values[i]
|
|
129
|
-
)
|
|
154
|
+
(pxt_name := col_mapping.get(col_name, col_name)): _ar_val_to_pxt_val(values[i], schema[pxt_name])
|
|
130
155
|
for col_name, values in pydict.items()
|
|
131
156
|
}
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
from typing import TextIO
|
|
2
3
|
|
|
3
4
|
|
|
4
5
|
def map_level(verbosity: int) -> int:
|
|
@@ -22,10 +23,10 @@ def map_level(verbosity: int) -> int:
|
|
|
22
23
|
|
|
23
24
|
|
|
24
25
|
class ConsoleOutputHandler(logging.StreamHandler):
|
|
25
|
-
def __init__(self, stream):
|
|
26
|
+
def __init__(self, stream: TextIO):
|
|
26
27
|
super().__init__(stream)
|
|
27
28
|
|
|
28
|
-
def emit(self, record):
|
|
29
|
+
def emit(self, record: logging.LogRecord) -> None:
|
|
29
30
|
if record.msg.endswith('\n'):
|
|
30
31
|
self.stream.write(record.msg)
|
|
31
32
|
else:
|
pixeltable/utils/coroutine.py
CHANGED
|
@@ -7,8 +7,8 @@ T = TypeVar('T')
|
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
# TODO This is a temporary hack to be able to run async UDFs in contexts that are not properly handled by the existing
|
|
10
|
-
# scheduler logic (e.g.,
|
|
11
|
-
# removed.
|
|
10
|
+
# scheduler logic (e.g., as an embedding function as part of a similarity lookup). Once the scheduler is fully
|
|
11
|
+
# general, it can be removed.
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
def run_coroutine_synchronously(coroutine: Coroutine[Any, Any, T], timeout: float = 30) -> T:
|
|
@@ -16,7 +16,7 @@ def run_coroutine_synchronously(coroutine: Coroutine[Any, Any, T], timeout: floa
|
|
|
16
16
|
Runs the given coroutine synchronously, even if called in the context of a running event loop.
|
|
17
17
|
"""
|
|
18
18
|
|
|
19
|
-
def run_in_new_loop():
|
|
19
|
+
def run_in_new_loop() -> T:
|
|
20
20
|
new_loop = asyncio.new_event_loop()
|
|
21
21
|
asyncio.set_event_loop(new_loop)
|
|
22
22
|
try:
|
pixeltable/utils/dbms.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
|
|
3
|
+
from sqlalchemy import URL
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Dbms(abc.ABC):
|
|
7
|
+
"""
|
|
8
|
+
Provides abstractions for utilities to interact with a database system.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
name: str
|
|
12
|
+
transaction_isolation_level: str
|
|
13
|
+
version_index_type: str
|
|
14
|
+
db_url: URL
|
|
15
|
+
|
|
16
|
+
def __init__(self, name: str, transaction_isolation_level: str, version_index_type: str, db_url: URL) -> None:
|
|
17
|
+
self.name = name
|
|
18
|
+
self.transaction_isolation_level = transaction_isolation_level
|
|
19
|
+
self.version_index_type = version_index_type
|
|
20
|
+
self.db_url = db_url
|
|
21
|
+
|
|
22
|
+
@abc.abstractmethod
|
|
23
|
+
def drop_db_stmt(self, database: str) -> str: ...
|
|
24
|
+
|
|
25
|
+
@abc.abstractmethod
|
|
26
|
+
def create_db_stmt(self, database: str) -> str: ...
|
|
27
|
+
|
|
28
|
+
@abc.abstractmethod
|
|
29
|
+
def default_system_db_url(self) -> str: ...
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class PostgresqlDbms(Dbms):
|
|
33
|
+
"""
|
|
34
|
+
Implements utilities to interact with Postgres database.
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
def __init__(self, db_url: URL):
|
|
38
|
+
super().__init__('postgresql', 'REPEATABLE READ', 'brin', db_url)
|
|
39
|
+
|
|
40
|
+
def drop_db_stmt(self, database: str) -> str:
|
|
41
|
+
return f'DROP DATABASE {database}'
|
|
42
|
+
|
|
43
|
+
def create_db_stmt(self, database: str) -> str:
|
|
44
|
+
return f"CREATE DATABASE {database} ENCODING 'utf-8' LC_COLLATE 'C' LC_CTYPE 'C' TEMPLATE template0"
|
|
45
|
+
|
|
46
|
+
def default_system_db_url(self) -> str:
|
|
47
|
+
a = self.db_url.set(database='postgres').render_as_string(hide_password=False)
|
|
48
|
+
return a
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class CockroachDbms(Dbms):
|
|
52
|
+
"""
|
|
53
|
+
Implements utilities to interact with CockroachDb database.
|
|
54
|
+
"""
|
|
55
|
+
|
|
56
|
+
def __init__(self, db_url: URL):
|
|
57
|
+
super().__init__('cockroachdb', 'SERIALIZABLE', 'btree', db_url)
|
|
58
|
+
|
|
59
|
+
def drop_db_stmt(self, database: str) -> str:
|
|
60
|
+
return f'DROP DATABASE {database} CASCADE'
|
|
61
|
+
|
|
62
|
+
def create_db_stmt(self, database: str) -> str:
|
|
63
|
+
return f"CREATE DATABASE {database} TEMPLATE template0 ENCODING 'utf-8' LC_COLLATE 'C' LC_CTYPE 'C'"
|
|
64
|
+
|
|
65
|
+
def default_system_db_url(self) -> str:
|
|
66
|
+
return self.db_url.set(database='defaultdb').render_as_string(hide_password=False)
|
pixeltable/utils/documents.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
import dataclasses
|
|
2
|
+
import os
|
|
2
3
|
from typing import Optional
|
|
3
4
|
|
|
4
5
|
import bs4
|
|
5
6
|
import fitz # type: ignore[import-untyped]
|
|
6
7
|
import puremagic
|
|
7
8
|
|
|
8
|
-
import
|
|
9
|
+
from pixeltable import exceptions as excs, type_system as ts
|
|
9
10
|
from pixeltable.env import Env
|
|
10
11
|
|
|
11
12
|
|
|
@@ -18,85 +19,78 @@ class DocumentHandle:
|
|
|
18
19
|
txt_doc: Optional[str] = None
|
|
19
20
|
|
|
20
21
|
|
|
21
|
-
def get_document_handle(path: str) ->
|
|
22
|
-
|
|
22
|
+
def get_document_handle(path: str) -> DocumentHandle:
|
|
23
|
+
_, extension = os.path.splitext(path)
|
|
24
|
+
handle = get_handle_by_extension(path, extension)
|
|
25
|
+
if handle is not None:
|
|
26
|
+
return handle
|
|
23
27
|
|
|
24
|
-
if
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
+
# if no extension, use puremagic to determine the type
|
|
29
|
+
extension = puremagic.from_file(path)
|
|
30
|
+
handle = get_handle_by_extension(path, extension)
|
|
31
|
+
if handle is not None:
|
|
32
|
+
return handle
|
|
28
33
|
|
|
29
|
-
|
|
30
|
-
bs_doc = get_html_handle(path)
|
|
31
|
-
if bs_doc is not None:
|
|
32
|
-
return DocumentHandle(format=ts.DocumentType.DocumentFormat.HTML, bs_doc=bs_doc)
|
|
34
|
+
raise excs.Error(f'Unrecognized document format: {path}')
|
|
33
35
|
|
|
34
|
-
if doc_format == '.md':
|
|
35
|
-
md_ast = get_markdown_handle(path)
|
|
36
|
-
if md_ast is not None:
|
|
37
|
-
return DocumentHandle(format=ts.DocumentType.DocumentFormat.MD, md_ast=md_ast)
|
|
38
36
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
if bs_doc is not None:
|
|
42
|
-
return DocumentHandle(format=ts.DocumentType.DocumentFormat.XML, bs_doc=bs_doc)
|
|
37
|
+
def get_handle_by_extension(path: str, extension: str) -> Optional[DocumentHandle]:
|
|
38
|
+
doc_format = ts.DocumentType.DocumentFormat.from_extension(extension)
|
|
43
39
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
40
|
+
try:
|
|
41
|
+
if doc_format == ts.DocumentType.DocumentFormat.HTML:
|
|
42
|
+
return DocumentHandle(doc_format, bs_doc=get_html_handle(path))
|
|
43
|
+
if doc_format == ts.DocumentType.DocumentFormat.MD:
|
|
44
|
+
return DocumentHandle(doc_format, md_ast=get_markdown_handle(path))
|
|
45
|
+
if doc_format == ts.DocumentType.DocumentFormat.PDF:
|
|
46
|
+
return DocumentHandle(doc_format, pdf_doc=get_pdf_handle(path))
|
|
47
|
+
if doc_format == ts.DocumentType.DocumentFormat.XML:
|
|
48
|
+
return DocumentHandle(doc_format, bs_doc=get_xml_handle(path))
|
|
49
|
+
if doc_format == ts.DocumentType.DocumentFormat.TXT:
|
|
50
|
+
return DocumentHandle(doc_format, txt_doc=get_txt(path))
|
|
51
|
+
except Exception as exc:
|
|
52
|
+
raise excs.Error(f'An error occurred processing a {doc_format} document: {path}') from exc
|
|
48
53
|
|
|
49
54
|
return None
|
|
50
55
|
|
|
51
56
|
|
|
52
|
-
def
|
|
53
|
-
|
|
54
|
-
doc =
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
# try to read one page
|
|
59
|
-
next(page for page in doc)
|
|
60
|
-
return doc
|
|
61
|
-
except Exception:
|
|
62
|
-
return None
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
def get_html_handle(path: str) -> Optional[bs4.BeautifulSoup]:
|
|
66
|
-
try:
|
|
67
|
-
with open(path, 'r', encoding='utf8') as fp:
|
|
68
|
-
doc = bs4.BeautifulSoup(fp, 'lxml')
|
|
69
|
-
return doc if doc.find() is not None else None
|
|
70
|
-
except Exception:
|
|
71
|
-
return None
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
def get_xml_handle(path: str) -> Optional[bs4.BeautifulSoup]:
|
|
75
|
-
try:
|
|
76
|
-
with open(path, 'r', encoding='utf8') as fp:
|
|
77
|
-
doc = bs4.BeautifulSoup(fp, 'xml')
|
|
78
|
-
return doc if doc.find() is not None else None
|
|
79
|
-
except Exception:
|
|
80
|
-
return None
|
|
57
|
+
def get_html_handle(path: str) -> bs4.BeautifulSoup:
|
|
58
|
+
with open(path, 'r', encoding='utf8') as fp:
|
|
59
|
+
doc = bs4.BeautifulSoup(fp, 'lxml')
|
|
60
|
+
if doc.find() is None:
|
|
61
|
+
raise excs.Error(f'Not a valid HTML document: {path}')
|
|
62
|
+
return doc
|
|
81
63
|
|
|
82
64
|
|
|
83
|
-
def get_markdown_handle(path: str) ->
|
|
65
|
+
def get_markdown_handle(path: str) -> dict:
|
|
84
66
|
Env.get().require_package('mistune', [3, 0])
|
|
85
67
|
import mistune
|
|
86
68
|
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
return md_ast(text)
|
|
92
|
-
except Exception:
|
|
93
|
-
return None
|
|
69
|
+
with open(path, encoding='utf8') as file:
|
|
70
|
+
text = file.read()
|
|
71
|
+
md_ast = mistune.create_markdown(renderer=None)
|
|
72
|
+
return md_ast(text)
|
|
94
73
|
|
|
95
74
|
|
|
96
|
-
def
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
75
|
+
def get_pdf_handle(path: str) -> fitz.Document:
|
|
76
|
+
doc = fitz.open(path)
|
|
77
|
+
# check pdf (bc it will work for images)
|
|
78
|
+
if not doc.is_pdf:
|
|
79
|
+
raise excs.Error(f'Not a valid PDF document: {path}')
|
|
80
|
+
# try to read one page
|
|
81
|
+
next(page for page in doc)
|
|
82
|
+
return doc
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def get_xml_handle(path: str) -> bs4.BeautifulSoup:
|
|
86
|
+
with open(path, 'r', encoding='utf8') as fp:
|
|
87
|
+
doc = bs4.BeautifulSoup(fp, 'xml')
|
|
88
|
+
if doc.find() is None:
|
|
89
|
+
raise excs.Error(f'Not a valid XML document: {path}')
|
|
90
|
+
return doc
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def get_txt(path: str) -> str:
|
|
94
|
+
with open(path, 'r', encoding='utf-8') as fp:
|
|
95
|
+
doc = fp.read()
|
|
96
|
+
return doc
|
pixeltable/utils/filecache.py
CHANGED
|
@@ -102,7 +102,7 @@ class FileCache:
|
|
|
102
102
|
def init(cls) -> None:
|
|
103
103
|
cls.__instance = cls()
|
|
104
104
|
|
|
105
|
-
def __init__(self):
|
|
105
|
+
def __init__(self) -> None:
|
|
106
106
|
self.cache = OrderedDict()
|
|
107
107
|
self.total_size = 0
|
|
108
108
|
self.capacity_bytes = int(Env.get()._file_cache_size_g * (1 << 30))
|
pixeltable/utils/http_server.py
CHANGED
|
@@ -3,6 +3,7 @@ import http.server
|
|
|
3
3
|
import logging
|
|
4
4
|
import pathlib
|
|
5
5
|
import urllib
|
|
6
|
+
from typing import Any
|
|
6
7
|
|
|
7
8
|
_logger = logging.getLogger('pixeltable.http.server')
|
|
8
9
|
|
|
@@ -38,7 +39,7 @@ class AbsolutePathHandler(http.server.SimpleHTTPRequestHandler):
|
|
|
38
39
|
path = pathlib.Path(urllib.request.url2pathname(path))
|
|
39
40
|
return str(path)
|
|
40
41
|
|
|
41
|
-
def log_message(self, format, *args) -> None:
|
|
42
|
+
def log_message(self, format: str, *args: Any) -> None:
|
|
42
43
|
"""override logging to stderr in http.server.BaseHTTPRequestHandler"""
|
|
43
44
|
message = format % args
|
|
44
45
|
_logger.info(message.translate(self._control_char_table)) # type: ignore[attr-defined]
|
|
@@ -47,7 +48,7 @@ class AbsolutePathHandler(http.server.SimpleHTTPRequestHandler):
|
|
|
47
48
|
class LoggingHTTPServer(http.server.ThreadingHTTPServer):
|
|
48
49
|
"""Avoids polluting stdout and stderr"""
|
|
49
50
|
|
|
50
|
-
def handle_error(self, request, client_address) -> None:
|
|
51
|
+
def handle_error(self, request, client_address) -> None: # type: ignore[no-untyped-def]
|
|
51
52
|
"""override socketserver.TCPServer.handle_error which prints directly to sys.stderr"""
|
|
52
53
|
import traceback
|
|
53
54
|
|
pixeltable/utils/pytorch.py
CHANGED
|
@@ -32,7 +32,7 @@ class PixeltablePytorchDataset(torch.utils.data.IterableDataset):
|
|
|
32
32
|
|
|
33
33
|
self.path = path
|
|
34
34
|
self.image_format = image_format
|
|
35
|
-
assert image_format in
|
|
35
|
+
assert image_format in ('np', 'pt')
|
|
36
36
|
column_type_path = path / '.pixeltable.column_types.json'
|
|
37
37
|
assert column_type_path.exists(), f'missing {column_type_path}'
|
|
38
38
|
with column_type_path.open() as f:
|
pixeltable/utils/sql.py
CHANGED
|
@@ -4,7 +4,7 @@ import sqlalchemy as sql
|
|
|
4
4
|
from sqlalchemy.dialects import postgresql
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
def log_stmt(logger: logging.Logger, stmt) -> None:
|
|
7
|
+
def log_stmt(logger: logging.Logger, stmt: sql.sql.ClauseElement) -> None:
|
|
8
8
|
logger.debug(f'executing {stmt.compile(dialect=postgresql.dialect())}')
|
|
9
9
|
|
|
10
10
|
|