pixeltable 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. More details were linked from the original page.

Files changed (122)
  1. pixeltable/__init__.py +2 -3
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +2 -1
  4. pixeltable/catalog/catalog.py +63 -36
  5. pixeltable/catalog/column.py +11 -4
  6. pixeltable/catalog/dir.py +5 -5
  7. pixeltable/catalog/globals.py +28 -14
  8. pixeltable/catalog/insertable_table.py +81 -43
  9. pixeltable/catalog/path.py +2 -2
  10. pixeltable/catalog/table.py +140 -109
  11. pixeltable/catalog/table_version.py +60 -43
  12. pixeltable/catalog/table_version_handle.py +3 -0
  13. pixeltable/catalog/table_version_path.py +1 -1
  14. pixeltable/catalog/view.py +17 -9
  15. pixeltable/dataframe.py +5 -3
  16. pixeltable/env.py +109 -43
  17. pixeltable/exec/__init__.py +2 -0
  18. pixeltable/exec/aggregation_node.py +6 -8
  19. pixeltable/exec/cache_prefetch_node.py +4 -7
  20. pixeltable/exec/component_iteration_node.py +1 -3
  21. pixeltable/exec/data_row_batch.py +1 -2
  22. pixeltable/exec/exec_context.py +1 -1
  23. pixeltable/exec/exec_node.py +2 -3
  24. pixeltable/exec/expr_eval/__init__.py +2 -0
  25. pixeltable/exec/expr_eval/evaluators.py +137 -20
  26. pixeltable/exec/expr_eval/expr_eval_node.py +43 -64
  27. pixeltable/exec/expr_eval/globals.py +68 -7
  28. pixeltable/exec/expr_eval/schedulers.py +25 -23
  29. pixeltable/exec/in_memory_data_node.py +8 -6
  30. pixeltable/exec/row_update_node.py +3 -4
  31. pixeltable/exec/sql_node.py +16 -17
  32. pixeltable/exprs/__init__.py +3 -2
  33. pixeltable/exprs/arithmetic_expr.py +2 -0
  34. pixeltable/exprs/column_property_ref.py +1 -1
  35. pixeltable/exprs/column_ref.py +39 -3
  36. pixeltable/exprs/compound_predicate.py +1 -1
  37. pixeltable/exprs/data_row.py +17 -1
  38. pixeltable/exprs/expr.py +51 -21
  39. pixeltable/exprs/function_call.py +34 -2
  40. pixeltable/exprs/globals.py +12 -0
  41. pixeltable/exprs/json_mapper.py +95 -48
  42. pixeltable/exprs/json_path.py +3 -10
  43. pixeltable/exprs/method_ref.py +2 -2
  44. pixeltable/exprs/object_ref.py +2 -2
  45. pixeltable/exprs/row_builder.py +33 -6
  46. pixeltable/exprs/similarity_expr.py +6 -21
  47. pixeltable/exprs/sql_element_cache.py +1 -1
  48. pixeltable/exprs/string_op.py +107 -0
  49. pixeltable/ext/__init__.py +1 -1
  50. pixeltable/ext/functions/__init__.py +1 -1
  51. pixeltable/ext/functions/whisperx.py +1 -1
  52. pixeltable/ext/functions/yolox.py +22 -65
  53. pixeltable/func/aggregate_function.py +1 -1
  54. pixeltable/func/callable_function.py +2 -5
  55. pixeltable/func/expr_template_function.py +22 -2
  56. pixeltable/func/function.py +4 -5
  57. pixeltable/func/function_registry.py +1 -1
  58. pixeltable/func/signature.py +1 -1
  59. pixeltable/func/tools.py +2 -2
  60. pixeltable/func/udf.py +2 -2
  61. pixeltable/functions/__init__.py +2 -2
  62. pixeltable/functions/anthropic.py +2 -2
  63. pixeltable/functions/audio.py +1 -1
  64. pixeltable/functions/deepseek.py +1 -1
  65. pixeltable/functions/fireworks.py +1 -1
  66. pixeltable/functions/globals.py +22 -11
  67. pixeltable/functions/huggingface.py +1 -1
  68. pixeltable/functions/image.py +1 -1
  69. pixeltable/functions/json.py +1 -1
  70. pixeltable/functions/llama_cpp.py +1 -1
  71. pixeltable/functions/math.py +1 -1
  72. pixeltable/functions/mistralai.py +1 -1
  73. pixeltable/functions/ollama.py +1 -1
  74. pixeltable/functions/openai.py +2 -2
  75. pixeltable/functions/replicate.py +1 -1
  76. pixeltable/functions/string.py +1 -1
  77. pixeltable/functions/timestamp.py +1 -1
  78. pixeltable/functions/together.py +1 -1
  79. pixeltable/functions/util.py +1 -1
  80. pixeltable/functions/video.py +2 -2
  81. pixeltable/functions/vision.py +2 -2
  82. pixeltable/globals.py +85 -33
  83. pixeltable/index/embedding_index.py +12 -1
  84. pixeltable/io/__init__.py +8 -5
  85. pixeltable/io/datarows.py +138 -0
  86. pixeltable/io/external_store.py +8 -5
  87. pixeltable/io/fiftyone.py +6 -7
  88. pixeltable/io/globals.py +7 -160
  89. pixeltable/io/hf_datasets.py +21 -98
  90. pixeltable/io/label_studio.py +21 -20
  91. pixeltable/io/pandas.py +35 -48
  92. pixeltable/io/parquet.py +17 -42
  93. pixeltable/io/table_data_conduit.py +569 -0
  94. pixeltable/io/utils.py +6 -21
  95. pixeltable/iterators/__init__.py +1 -1
  96. pixeltable/metadata/__init__.py +6 -4
  97. pixeltable/metadata/converters/convert_24.py +3 -3
  98. pixeltable/metadata/converters/convert_25.py +1 -1
  99. pixeltable/metadata/converters/convert_29.py +1 -1
  100. pixeltable/metadata/converters/convert_30.py +50 -0
  101. pixeltable/metadata/converters/util.py +26 -1
  102. pixeltable/metadata/notes.py +1 -0
  103. pixeltable/metadata/schema.py +3 -0
  104. pixeltable/store.py +2 -2
  105. pixeltable/type_system.py +19 -7
  106. pixeltable/utils/arrow.py +32 -7
  107. pixeltable/utils/console_output.py +3 -2
  108. pixeltable/utils/coroutine.py +3 -3
  109. pixeltable/utils/dbms.py +66 -0
  110. pixeltable/utils/documents.py +61 -67
  111. pixeltable/utils/filecache.py +1 -1
  112. pixeltable/utils/http_server.py +3 -2
  113. pixeltable/utils/pytorch.py +1 -1
  114. pixeltable/utils/sql.py +1 -1
  115. pixeltable-0.3.11.dist-info/METADATA +436 -0
  116. pixeltable-0.3.11.dist-info/RECORD +179 -0
  117. {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/WHEEL +1 -1
  118. pixeltable/catalog/path_dict.py +0 -169
  119. pixeltable-0.3.9.dist-info/METADATA +0 -382
  120. pixeltable-0.3.9.dist-info/RECORD +0 -175
  121. {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/LICENSE +0 -0
  122. {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/entry_points.txt +0 -0
@@ -80,7 +80,7 @@ def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], A
80
80
  rolled_kwargs = kwargs.pop(param['name'])
81
81
 
82
82
  if rolled_args is not None:
83
- assert rolled_args['_classname'] in {'InlineArray', 'InlineList'}
83
+ assert rolled_args['_classname'] in ('InlineArray', 'InlineList')
84
84
  new_args.extend(rolled_args['components'])
85
85
  if rolled_kwargs is not None:
86
86
  assert rolled_kwargs['_classname'] == 'InlineDict'
pixeltable/metadata/converters/convert_30.py ADDED
@@ -0,0 +1,50 @@
1
+ import copy
2
+
3
+ import sqlalchemy as sql
4
+
5
+ from pixeltable.metadata import register_converter
6
+ from pixeltable.metadata.converters.util import (
7
+ convert_table_record,
8
+ convert_table_schema_version_record,
9
+ convert_table_version_record,
10
+ )
11
+ from pixeltable.metadata.schema import Table, TableSchemaVersion, TableVersion
12
+
13
+
14
+ @register_converter(version=30)
15
+ def _(engine: sql.engine.Engine) -> None:
16
+ convert_table_record(engine, table_record_updater=__update_table_record)
17
+ convert_table_version_record(engine, table_version_record_updater=__update_table_version_record)
18
+ convert_table_schema_version_record(
19
+ engine, table_schema_version_record_updater=__update_table_schema_version_record
20
+ )
21
+
22
+
23
+ def __update_table_record(record: Table) -> None:
24
+ """
25
+ Update TableMd with table_id
26
+ """
27
+ assert isinstance(record.md, dict)
28
+ md = copy.copy(record.md)
29
+ md['tbl_id'] = str(record.id)
30
+ record.md = md
31
+
32
+
33
+ def __update_table_version_record(record: TableVersion) -> None:
34
+ """
35
+ Update TableVersion with table_id.
36
+ """
37
+ assert isinstance(record.md, dict)
38
+ md = copy.copy(record.md)
39
+ md['tbl_id'] = str(record.tbl_id)
40
+ record.md = md
41
+
42
+
43
+ def __update_table_schema_version_record(record: TableSchemaVersion) -> None:
44
+ """
45
+ Update TableSchemaVersion with table_id.
46
+ """
47
+ assert isinstance(record.md, dict)
48
+ md = copy.copy(record.md)
49
+ md['tbl_id'] = str(record.tbl_id)
50
+ record.md = md
pixeltable/metadata/converters/util.py CHANGED
@@ -5,7 +5,7 @@ from uuid import UUID
5
5
 
6
6
  import sqlalchemy as sql
7
7
 
8
- from pixeltable.metadata.schema import Function, Table, TableSchemaVersion
8
+ from pixeltable.metadata.schema import Function, Table, TableSchemaVersion, TableVersion
9
9
 
10
10
  __logger = logging.getLogger('pixeltable')
11
11
 
@@ -143,3 +143,28 @@ def __update_schema_column(table_schema_version_md: dict, schema_column_updater:
143
143
  assert isinstance(cols, dict)
144
144
  for schema_col in cols.values():
145
145
  schema_column_updater(schema_col)
146
+
147
+
148
+ def convert_table_record(engine: sql.engine.Engine, table_record_updater: Optional[Callable[[Table], None]]) -> None:
149
+ with sql.orm.Session(engine, future=True) as session:
150
+ for record in session.query(Table).all():
151
+ table_record_updater(record)
152
+ session.commit()
153
+
154
+
155
+ def convert_table_version_record(
156
+ engine: sql.engine.Engine, table_version_record_updater: Optional[Callable[[TableVersion], None]]
157
+ ) -> None:
158
+ with sql.orm.Session(engine, future=True) as session:
159
+ for record in session.query(TableVersion).all():
160
+ table_version_record_updater(record)
161
+ session.commit()
162
+
163
+
164
+ def convert_table_schema_version_record(
165
+ engine: sql.engine.Engine, table_schema_version_record_updater: Optional[Callable[[TableSchemaVersion], None]]
166
+ ) -> None:
167
+ with sql.orm.Session(engine, future=True) as session:
168
+ for record in session.query(TableSchemaVersion).all():
169
+ table_schema_version_record_updater(record)
170
+ session.commit()
pixeltable/metadata/notes.py CHANGED
@@ -2,6 +2,7 @@
2
2
  # rather than as a comment, so that the existence of a description can be enforced by
3
3
  # the unit tests when new versions are added.
4
4
  VERSION_NOTES = {
5
+ 31: 'Add table ids to metadata structs',
5
6
  30: 'Store default values and constant arguments as literals',
6
7
  29: 'Add user and additional_md fields to metadata structs',
7
8
  28: 'Enable view creation from DataFrame with select clause',
pixeltable/metadata/schema.py CHANGED
@@ -153,6 +153,7 @@ class ViewMd:
153
153
 
154
154
  @dataclasses.dataclass
155
155
  class TableMd:
156
+ tbl_id: str # uuid.UUID
156
157
  name: str
157
158
 
158
159
  user: Optional[str]
@@ -199,6 +200,7 @@ class Table(Base):
199
200
 
200
201
  @dataclasses.dataclass
201
202
  class TableVersionMd:
203
+ tbl_id: str # uuid.UUID
202
204
  created_at: float # time.time()
203
205
  version: int
204
206
  schema_version: int
@@ -234,6 +236,7 @@ class TableSchemaVersionMd:
234
236
  Records all versioned table metadata.
235
237
  """
236
238
 
239
+ tbl_id: str # uuid.UUID
237
240
  schema_version: int
238
241
  preceding_schema_version: Optional[int]
239
242
  columns: dict[int, SchemaColumn] # col_id -> SchemaColumn
pixeltable/store.py CHANGED
@@ -99,9 +99,9 @@ class StoreBase:
99
99
 
100
100
  # v_min/v_max indices: speeds up base table scans needed to propagate a base table insert or delete
101
101
  idx_name = f'vmin_idx_{self.tbl_version.id.hex}'
102
- idxs.append(sql.Index(idx_name, self.v_min_col, postgresql_using='brin'))
102
+ idxs.append(sql.Index(idx_name, self.v_min_col, postgresql_using=Env.get().dbms.version_index_type))
103
103
  idx_name = f'vmax_idx_{self.tbl_version.id.hex}'
104
- idxs.append(sql.Index(idx_name, self.v_max_col, postgresql_using='brin'))
104
+ idxs.append(sql.Index(idx_name, self.v_max_col, postgresql_using=Env.get().dbms.version_index_type))
105
105
 
106
106
  self.sa_tbl = sql.Table(self._storage_name(), self.sa_md, *all_cols, *idxs)
107
107
 
pixeltable/type_system.py CHANGED
@@ -512,7 +512,7 @@ class StringType(ColumnType):
512
512
  def __init__(self, nullable: bool = False):
513
513
  super().__init__(self.Type.STRING, nullable=nullable)
514
514
 
515
- def has_supertype(self):
515
+ def has_supertype(self) -> bool:
516
516
  return not self.nullable
517
517
 
518
518
  @classmethod
@@ -602,7 +602,7 @@ class TimestampType(ColumnType):
602
602
  def __init__(self, nullable: bool = False):
603
603
  super().__init__(self.Type.TIMESTAMP, nullable=nullable)
604
604
 
605
- def has_supertype(self):
605
+ def has_supertype(self) -> bool:
606
606
  return not self.nullable
607
607
 
608
608
  @classmethod
@@ -768,7 +768,7 @@ class JsonType(ColumnType):
768
768
  a_type = a.get('type')
769
769
  b_type = b.get('type')
770
770
 
771
- if a_type in {'string', 'integer', 'number', 'boolean', 'object', 'array'} and a_type == b_type:
771
+ if a_type in ('string', 'integer', 'number', 'boolean', 'object', 'array') and a_type == b_type:
772
772
  # a and b both have the same type designation, but are not identical. This can happen if
773
773
  # (for example) they have validators or other attributes that differ. In this case, we
774
774
  # generalize to {'type': t}, where t is their shared type, with no other qualifications.
@@ -1170,6 +1170,20 @@ class DocumentType(ColumnType):
1170
1170
  XML = 3
1171
1171
  TXT = 4
1172
1172
 
1173
+ @classmethod
1174
+ def from_extension(cls, ext: str) -> Optional['DocumentType.DocumentFormat']:
1175
+ if ext in ('.htm', '.html'):
1176
+ return cls.HTML
1177
+ if ext == '.md':
1178
+ return cls.MD
1179
+ if ext == '.pdf':
1180
+ return cls.PDF
1181
+ if ext == '.xml':
1182
+ return cls.XML
1183
+ if ext == '.txt':
1184
+ return cls.TXT
1185
+ return None
1186
+
1173
1187
  def __init__(self, nullable: bool = False, doc_formats: Optional[str] = None):
1174
1188
  super().__init__(self.Type.DOCUMENT, nullable=nullable)
1175
1189
  self.doc_formats = doc_formats
@@ -1203,9 +1217,7 @@ class DocumentType(ColumnType):
1203
1217
  assert isinstance(val, str)
1204
1218
  from pixeltable.utils.documents import get_document_handle
1205
1219
 
1206
- dh = get_document_handle(val)
1207
- if dh is None:
1208
- raise excs.Error(f'Not a recognized document format: {val}')
1220
+ _ = get_document_handle(val)
1209
1221
 
1210
1222
 
1211
1223
  T = typing.TypeVar('T')
@@ -1240,7 +1252,7 @@ class _PxtType:
1240
1252
  `ColumnType`.
1241
1253
  """
1242
1254
 
1243
- def __init__(self):
1255
+ def __init__(self) -> None:
1244
1256
  raise TypeError(f'Type `{type(self)}` cannot be instantiated.')
1245
1257
 
1246
1258
  @classmethod
pixeltable/utils/arrow.py CHANGED
@@ -11,14 +11,19 @@ PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
11
11
  pa.large_string(): ts.StringType(nullable=True),
12
12
  pa.timestamp('us', tz=datetime.timezone.utc): ts.TimestampType(nullable=True),
13
13
  pa.bool_(): ts.BoolType(nullable=True),
14
- pa.uint8(): ts.IntType(nullable=True),
15
14
  pa.int8(): ts.IntType(nullable=True),
16
- pa.uint32(): ts.IntType(nullable=True),
17
- pa.uint64(): ts.IntType(nullable=True),
15
+ pa.int16(): ts.IntType(nullable=True),
18
16
  pa.int32(): ts.IntType(nullable=True),
19
17
  pa.int64(): ts.IntType(nullable=True),
18
+ pa.uint8(): ts.IntType(nullable=True),
19
+ pa.uint16(): ts.IntType(nullable=True),
20
+ pa.uint32(): ts.IntType(nullable=True),
21
+ pa.uint64(): ts.IntType(nullable=True),
20
22
  pa.float32(): ts.FloatType(nullable=True),
21
23
  pa.float64(): ts.FloatType(nullable=True),
24
+ pa.date32(): ts.StringType(nullable=True), # date32 is not supported in pixeltable, use string
25
+ pa.date64(): ts.StringType(nullable=True), # date64 is not supported in pixeltable, use string
26
+ pa.binary(): None, # cannot import binary (inline image)
22
27
  }
23
28
 
24
29
  PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
@@ -43,7 +48,7 @@ def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.C
43
48
  return ts.TimestampType(nullable=nullable)
44
49
  elif arrow_type in PA_TO_PXT_TYPES:
45
50
  pt = PA_TO_PXT_TYPES[arrow_type]
46
- return pt.copy(nullable=nullable)
51
+ return pt.copy(nullable=nullable) if pt is not None else None
47
52
  elif isinstance(arrow_type, pa.FixedShapeTensorType):
48
53
  dtype = to_pixeltable_type(arrow_type.value_type, nullable)
49
54
  if dtype is None:
@@ -111,6 +116,28 @@ def iter_tuples(batch: Union[pa.Table, pa.RecordBatch]) -> Iterator[dict[str, An
111
116
  yield {col_name: values[i] for col_name, values in pydict.items()}
112
117
 
113
118
 
119
+ def _ar_val_to_pxt_val(val: Any, pxt_type: ts.ColumnType) -> Any:
120
+ """Convert a value to insertable format"""
121
+ if val is None:
122
+ return None
123
+ if pxt_type.is_float_type():
124
+ return float(val)
125
+ elif pxt_type.is_int_type():
126
+ return int(val)
127
+ elif pxt_type.is_bool_type():
128
+ return bool(val)
129
+ elif pxt_type.is_string_type():
130
+ return str(val)
131
+ elif pxt_type.is_timestamp_type():
132
+ if isinstance(val, str):
133
+ return datetime.datetime.fromisoformat(val)
134
+ if isinstance(val, datetime.datetime):
135
+ return val
136
+ elif pxt_type.is_array_type():
137
+ return pxt_type.create_literal(val)
138
+ raise ValueError(f'Unsupported type {pxt_type} for value {val}')
139
+
140
+
114
141
  def iter_tuples2(
115
142
  batch: Union[pa.Table, pa.RecordBatch], col_mapping: Optional[dict[str, str]], schema: dict[str, ts.ColumnType]
116
143
  ) -> Iterator[dict[str, Any]]:
@@ -124,8 +151,6 @@ def iter_tuples2(
124
151
  for i in range(batch_size):
125
152
  # Convert a row to insertable format
126
153
  yield {
127
- (pxt_name := col_name if col_mapping is None else col_mapping[col_name]): schema[pxt_name].create_literal(
128
- values[i]
129
- )
154
+ (pxt_name := col_mapping.get(col_name, col_name)): _ar_val_to_pxt_val(values[i], schema[pxt_name])
130
155
  for col_name, values in pydict.items()
131
156
  }
pixeltable/utils/console_output.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ from typing import TextIO
2
3
 
3
4
 
4
5
  def map_level(verbosity: int) -> int:
@@ -22,10 +23,10 @@ def map_level(verbosity: int) -> int:
22
23
 
23
24
 
24
25
  class ConsoleOutputHandler(logging.StreamHandler):
25
- def __init__(self, stream):
26
+ def __init__(self, stream: TextIO):
26
27
  super().__init__(stream)
27
28
 
28
- def emit(self, record):
29
+ def emit(self, record: logging.LogRecord) -> None:
29
30
  if record.msg.endswith('\n'):
30
31
  self.stream.write(record.msg)
31
32
  else:
pixeltable/utils/coroutine.py CHANGED
@@ -7,8 +7,8 @@ T = TypeVar('T')
7
7
 
8
8
 
9
9
  # TODO This is a temporary hack to be able to run async UDFs in contexts that are not properly handled by the existing
10
- # scheduler logic (e.g., inside the eval loop of a JsonMapper). Once the scheduler is fully general, it can be
11
- # removed.
10
+ # scheduler logic (e.g., as an embedding function as part of a similarity lookup). Once the scheduler is fully
11
+ # general, it can be removed.
12
12
 
13
13
 
14
14
  def run_coroutine_synchronously(coroutine: Coroutine[Any, Any, T], timeout: float = 30) -> T:
@@ -16,7 +16,7 @@ def run_coroutine_synchronously(coroutine: Coroutine[Any, Any, T], timeout: floa
16
16
  Runs the given coroutine synchronously, even if called in the context of a running event loop.
17
17
  """
18
18
 
19
- def run_in_new_loop():
19
+ def run_in_new_loop() -> T:
20
20
  new_loop = asyncio.new_event_loop()
21
21
  asyncio.set_event_loop(new_loop)
22
22
  try:
pixeltable/utils/dbms.py ADDED
@@ -0,0 +1,66 @@
1
+ import abc
2
+
3
+ from sqlalchemy import URL
4
+
5
+
6
+ class Dbms(abc.ABC):
7
+ """
8
+ Provides abstractions for utilities to interact with a database system.
9
+ """
10
+
11
+ name: str
12
+ transaction_isolation_level: str
13
+ version_index_type: str
14
+ db_url: URL
15
+
16
+ def __init__(self, name: str, transaction_isolation_level: str, version_index_type: str, db_url: URL) -> None:
17
+ self.name = name
18
+ self.transaction_isolation_level = transaction_isolation_level
19
+ self.version_index_type = version_index_type
20
+ self.db_url = db_url
21
+
22
+ @abc.abstractmethod
23
+ def drop_db_stmt(self, database: str) -> str: ...
24
+
25
+ @abc.abstractmethod
26
+ def create_db_stmt(self, database: str) -> str: ...
27
+
28
+ @abc.abstractmethod
29
+ def default_system_db_url(self) -> str: ...
30
+
31
+
32
+ class PostgresqlDbms(Dbms):
33
+ """
34
+ Implements utilities to interact with Postgres database.
35
+ """
36
+
37
+ def __init__(self, db_url: URL):
38
+ super().__init__('postgresql', 'REPEATABLE READ', 'brin', db_url)
39
+
40
+ def drop_db_stmt(self, database: str) -> str:
41
+ return f'DROP DATABASE {database}'
42
+
43
+ def create_db_stmt(self, database: str) -> str:
44
+ return f"CREATE DATABASE {database} ENCODING 'utf-8' LC_COLLATE 'C' LC_CTYPE 'C' TEMPLATE template0"
45
+
46
+ def default_system_db_url(self) -> str:
47
+ a = self.db_url.set(database='postgres').render_as_string(hide_password=False)
48
+ return a
49
+
50
+
51
+ class CockroachDbms(Dbms):
52
+ """
53
+ Implements utilities to interact with CockroachDb database.
54
+ """
55
+
56
+ def __init__(self, db_url: URL):
57
+ super().__init__('cockroachdb', 'SERIALIZABLE', 'btree', db_url)
58
+
59
+ def drop_db_stmt(self, database: str) -> str:
60
+ return f'DROP DATABASE {database} CASCADE'
61
+
62
+ def create_db_stmt(self, database: str) -> str:
63
+ return f"CREATE DATABASE {database} TEMPLATE template0 ENCODING 'utf-8' LC_COLLATE 'C' LC_CTYPE 'C'"
64
+
65
+ def default_system_db_url(self) -> str:
66
+ return self.db_url.set(database='defaultdb').render_as_string(hide_password=False)
pixeltable/utils/documents.py CHANGED
@@ -1,11 +1,12 @@
1
1
  import dataclasses
2
+ import os
2
3
  from typing import Optional
3
4
 
4
5
  import bs4
5
6
  import fitz # type: ignore[import-untyped]
6
7
  import puremagic
7
8
 
8
- import pixeltable.type_system as ts
9
+ from pixeltable import exceptions as excs, type_system as ts
9
10
  from pixeltable.env import Env
10
11
 
11
12
 
@@ -18,85 +19,78 @@ class DocumentHandle:
18
19
  txt_doc: Optional[str] = None
19
20
 
20
21
 
21
- def get_document_handle(path: str) -> Optional[DocumentHandle]:
22
- doc_format = puremagic.from_file(path)
22
+ def get_document_handle(path: str) -> DocumentHandle:
23
+ _, extension = os.path.splitext(path)
24
+ handle = get_handle_by_extension(path, extension)
25
+ if handle is not None:
26
+ return handle
23
27
 
24
- if doc_format == '.pdf':
25
- pdf_doc = get_pdf_handle(path)
26
- if pdf_doc is not None:
27
- return DocumentHandle(format=ts.DocumentType.DocumentFormat.PDF, pdf_doc=pdf_doc)
28
+ # if no extension, use puremagic to determine the type
29
+ extension = puremagic.from_file(path)
30
+ handle = get_handle_by_extension(path, extension)
31
+ if handle is not None:
32
+ return handle
28
33
 
29
- if doc_format == '.html':
30
- bs_doc = get_html_handle(path)
31
- if bs_doc is not None:
32
- return DocumentHandle(format=ts.DocumentType.DocumentFormat.HTML, bs_doc=bs_doc)
34
+ raise excs.Error(f'Unrecognized document format: {path}')
33
35
 
34
- if doc_format == '.md':
35
- md_ast = get_markdown_handle(path)
36
- if md_ast is not None:
37
- return DocumentHandle(format=ts.DocumentType.DocumentFormat.MD, md_ast=md_ast)
38
36
 
39
- if doc_format == '.xml':
40
- bs_doc = get_xml_handle(path)
41
- if bs_doc is not None:
42
- return DocumentHandle(format=ts.DocumentType.DocumentFormat.XML, bs_doc=bs_doc)
37
+ def get_handle_by_extension(path: str, extension: str) -> Optional[DocumentHandle]:
38
+ doc_format = ts.DocumentType.DocumentFormat.from_extension(extension)
43
39
 
44
- if doc_format == '.txt':
45
- txt_doc = get_txt(path)
46
- if txt_doc is not None:
47
- return DocumentHandle(format=ts.DocumentType.DocumentFormat.TXT, txt_doc=txt_doc)
40
+ try:
41
+ if doc_format == ts.DocumentType.DocumentFormat.HTML:
42
+ return DocumentHandle(doc_format, bs_doc=get_html_handle(path))
43
+ if doc_format == ts.DocumentType.DocumentFormat.MD:
44
+ return DocumentHandle(doc_format, md_ast=get_markdown_handle(path))
45
+ if doc_format == ts.DocumentType.DocumentFormat.PDF:
46
+ return DocumentHandle(doc_format, pdf_doc=get_pdf_handle(path))
47
+ if doc_format == ts.DocumentType.DocumentFormat.XML:
48
+ return DocumentHandle(doc_format, bs_doc=get_xml_handle(path))
49
+ if doc_format == ts.DocumentType.DocumentFormat.TXT:
50
+ return DocumentHandle(doc_format, txt_doc=get_txt(path))
51
+ except Exception as exc:
52
+ raise excs.Error(f'An error occurred processing a {doc_format} document: {path}') from exc
48
53
 
49
54
  return None
50
55
 
51
56
 
52
- def get_pdf_handle(path: str) -> Optional[fitz.Document]:
53
- try:
54
- doc = fitz.open(path)
55
- # check pdf (bc it will work for images)
56
- if not doc.is_pdf:
57
- return None
58
- # try to read one page
59
- next(page for page in doc)
60
- return doc
61
- except Exception:
62
- return None
63
-
64
-
65
- def get_html_handle(path: str) -> Optional[bs4.BeautifulSoup]:
66
- try:
67
- with open(path, 'r', encoding='utf8') as fp:
68
- doc = bs4.BeautifulSoup(fp, 'lxml')
69
- return doc if doc.find() is not None else None
70
- except Exception:
71
- return None
72
-
73
-
74
- def get_xml_handle(path: str) -> Optional[bs4.BeautifulSoup]:
75
- try:
76
- with open(path, 'r', encoding='utf8') as fp:
77
- doc = bs4.BeautifulSoup(fp, 'xml')
78
- return doc if doc.find() is not None else None
79
- except Exception:
80
- return None
57
+ def get_html_handle(path: str) -> bs4.BeautifulSoup:
58
+ with open(path, 'r', encoding='utf8') as fp:
59
+ doc = bs4.BeautifulSoup(fp, 'lxml')
60
+ if doc.find() is None:
61
+ raise excs.Error(f'Not a valid HTML document: {path}')
62
+ return doc
81
63
 
82
64
 
83
- def get_markdown_handle(path: str) -> Optional[dict]:
65
+ def get_markdown_handle(path: str) -> dict:
84
66
  Env.get().require_package('mistune', [3, 0])
85
67
  import mistune
86
68
 
87
- try:
88
- with open(path, encoding='utf8') as file:
89
- text = file.read()
90
- md_ast = mistune.create_markdown(renderer=None)
91
- return md_ast(text)
92
- except Exception:
93
- return None
69
+ with open(path, encoding='utf8') as file:
70
+ text = file.read()
71
+ md_ast = mistune.create_markdown(renderer=None)
72
+ return md_ast(text)
94
73
 
95
74
 
96
- def get_txt(path: str) -> Optional[str]:
97
- try:
98
- with open(path, 'r', encoding='utf-8') as fp:
99
- doc = fp.read()
100
- return doc or None # replace '' with None
101
- except Exception:
102
- return None
75
+ def get_pdf_handle(path: str) -> fitz.Document:
76
+ doc = fitz.open(path)
77
+ # check pdf (bc it will work for images)
78
+ if not doc.is_pdf:
79
+ raise excs.Error(f'Not a valid PDF document: {path}')
80
+ # try to read one page
81
+ next(page for page in doc)
82
+ return doc
83
+
84
+
85
+ def get_xml_handle(path: str) -> bs4.BeautifulSoup:
86
+ with open(path, 'r', encoding='utf8') as fp:
87
+ doc = bs4.BeautifulSoup(fp, 'xml')
88
+ if doc.find() is None:
89
+ raise excs.Error(f'Not a valid XML document: {path}')
90
+ return doc
91
+
92
+
93
+ def get_txt(path: str) -> str:
94
+ with open(path, 'r', encoding='utf-8') as fp:
95
+ doc = fp.read()
96
+ return doc
pixeltable/utils/filecache.py CHANGED
@@ -102,7 +102,7 @@ class FileCache:
102
102
  def init(cls) -> None:
103
103
  cls.__instance = cls()
104
104
 
105
- def __init__(self):
105
+ def __init__(self) -> None:
106
106
  self.cache = OrderedDict()
107
107
  self.total_size = 0
108
108
  self.capacity_bytes = int(Env.get()._file_cache_size_g * (1 << 30))
pixeltable/utils/http_server.py CHANGED
@@ -3,6 +3,7 @@ import http.server
3
3
  import logging
4
4
  import pathlib
5
5
  import urllib
6
+ from typing import Any
6
7
 
7
8
  _logger = logging.getLogger('pixeltable.http.server')
8
9
 
@@ -38,7 +39,7 @@ class AbsolutePathHandler(http.server.SimpleHTTPRequestHandler):
38
39
  path = pathlib.Path(urllib.request.url2pathname(path))
39
40
  return str(path)
40
41
 
41
- def log_message(self, format, *args) -> None:
42
+ def log_message(self, format: str, *args: Any) -> None:
42
43
  """override logging to stderr in http.server.BaseHTTPRequestHandler"""
43
44
  message = format % args
44
45
  _logger.info(message.translate(self._control_char_table)) # type: ignore[attr-defined]
@@ -47,7 +48,7 @@ class AbsolutePathHandler(http.server.SimpleHTTPRequestHandler):
47
48
  class LoggingHTTPServer(http.server.ThreadingHTTPServer):
48
49
  """Avoids polluting stdout and stderr"""
49
50
 
50
- def handle_error(self, request, client_address) -> None:
51
+ def handle_error(self, request, client_address) -> None: # type: ignore[no-untyped-def]
51
52
  """override socketserver.TCPServer.handle_error which prints directly to sys.stderr"""
52
53
  import traceback
53
54
 
pixeltable/utils/pytorch.py CHANGED
@@ -32,7 +32,7 @@ class PixeltablePytorchDataset(torch.utils.data.IterableDataset):
32
32
 
33
33
  self.path = path
34
34
  self.image_format = image_format
35
- assert image_format in {'np', 'pt'}
35
+ assert image_format in ('np', 'pt')
36
36
  column_type_path = path / '.pixeltable.column_types.json'
37
37
  assert column_type_path.exists(), f'missing {column_type_path}'
38
38
  with column_type_path.open() as f:
pixeltable/utils/sql.py CHANGED
@@ -4,7 +4,7 @@ import sqlalchemy as sql
4
4
  from sqlalchemy.dialects import postgresql
5
5
 
6
6
 
7
- def log_stmt(logger: logging.Logger, stmt) -> None:
7
+ def log_stmt(logger: logging.Logger, stmt: sql.sql.ClauseElement) -> None:
8
8
  logger.debug(f'executing {stmt.compile(dialect=postgresql.dialect())}')
9
9
 
10
10