pixeltable 0.2.25__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (97)
  1. pixeltable/__init__.py +2 -2
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +1 -1
  4. pixeltable/catalog/dir.py +6 -0
  5. pixeltable/catalog/globals.py +25 -0
  6. pixeltable/catalog/named_function.py +4 -0
  7. pixeltable/catalog/path_dict.py +37 -11
  8. pixeltable/catalog/schema_object.py +6 -0
  9. pixeltable/catalog/table.py +421 -231
  10. pixeltable/catalog/table_version.py +22 -8
  11. pixeltable/catalog/view.py +5 -7
  12. pixeltable/dataframe.py +439 -105
  13. pixeltable/env.py +19 -5
  14. pixeltable/exec/__init__.py +1 -1
  15. pixeltable/exec/exec_node.py +6 -7
  16. pixeltable/exec/expr_eval_node.py +1 -1
  17. pixeltable/exec/sql_node.py +92 -45
  18. pixeltable/exprs/__init__.py +1 -0
  19. pixeltable/exprs/arithmetic_expr.py +1 -1
  20. pixeltable/exprs/array_slice.py +1 -1
  21. pixeltable/exprs/column_property_ref.py +1 -1
  22. pixeltable/exprs/column_ref.py +29 -2
  23. pixeltable/exprs/comparison.py +1 -1
  24. pixeltable/exprs/compound_predicate.py +1 -1
  25. pixeltable/exprs/expr.py +12 -5
  26. pixeltable/exprs/expr_set.py +8 -0
  27. pixeltable/exprs/function_call.py +147 -39
  28. pixeltable/exprs/in_predicate.py +1 -1
  29. pixeltable/exprs/inline_expr.py +25 -5
  30. pixeltable/exprs/is_null.py +1 -1
  31. pixeltable/exprs/json_mapper.py +1 -1
  32. pixeltable/exprs/json_path.py +1 -1
  33. pixeltable/exprs/method_ref.py +1 -1
  34. pixeltable/exprs/row_builder.py +1 -1
  35. pixeltable/exprs/rowid_ref.py +1 -1
  36. pixeltable/exprs/similarity_expr.py +14 -7
  37. pixeltable/exprs/sql_element_cache.py +4 -0
  38. pixeltable/exprs/type_cast.py +2 -2
  39. pixeltable/exprs/variable.py +3 -0
  40. pixeltable/func/__init__.py +5 -4
  41. pixeltable/func/aggregate_function.py +151 -68
  42. pixeltable/func/callable_function.py +48 -16
  43. pixeltable/func/expr_template_function.py +64 -23
  44. pixeltable/func/function.py +195 -27
  45. pixeltable/func/function_registry.py +2 -1
  46. pixeltable/func/query_template_function.py +51 -9
  47. pixeltable/func/signature.py +64 -7
  48. pixeltable/func/tools.py +153 -0
  49. pixeltable/func/udf.py +57 -35
  50. pixeltable/functions/__init__.py +2 -2
  51. pixeltable/functions/anthropic.py +51 -4
  52. pixeltable/functions/gemini.py +85 -0
  53. pixeltable/functions/globals.py +54 -34
  54. pixeltable/functions/huggingface.py +10 -28
  55. pixeltable/functions/json.py +3 -8
  56. pixeltable/functions/math.py +67 -0
  57. pixeltable/functions/ollama.py +8 -8
  58. pixeltable/functions/openai.py +51 -4
  59. pixeltable/functions/timestamp.py +1 -1
  60. pixeltable/functions/video.py +3 -9
  61. pixeltable/functions/vision.py +1 -1
  62. pixeltable/globals.py +354 -80
  63. pixeltable/index/embedding_index.py +106 -34
  64. pixeltable/io/__init__.py +1 -1
  65. pixeltable/io/label_studio.py +1 -1
  66. pixeltable/io/parquet.py +39 -19
  67. pixeltable/iterators/document.py +12 -0
  68. pixeltable/metadata/__init__.py +1 -1
  69. pixeltable/metadata/converters/convert_16.py +2 -1
  70. pixeltable/metadata/converters/convert_17.py +2 -1
  71. pixeltable/metadata/converters/convert_22.py +17 -0
  72. pixeltable/metadata/converters/convert_23.py +35 -0
  73. pixeltable/metadata/converters/convert_24.py +56 -0
  74. pixeltable/metadata/converters/convert_25.py +19 -0
  75. pixeltable/metadata/converters/util.py +4 -2
  76. pixeltable/metadata/notes.py +4 -0
  77. pixeltable/metadata/schema.py +1 -0
  78. pixeltable/plan.py +128 -50
  79. pixeltable/store.py +1 -1
  80. pixeltable/type_system.py +196 -54
  81. pixeltable/utils/arrow.py +8 -3
  82. pixeltable/utils/description_helper.py +89 -0
  83. pixeltable/utils/documents.py +14 -0
  84. {pixeltable-0.2.25.dist-info → pixeltable-0.3.0.dist-info}/METADATA +30 -20
  85. pixeltable-0.3.0.dist-info/RECORD +155 -0
  86. {pixeltable-0.2.25.dist-info → pixeltable-0.3.0.dist-info}/WHEEL +1 -1
  87. pixeltable-0.3.0.dist-info/entry_points.txt +3 -0
  88. pixeltable/tool/create_test_db_dump.py +0 -311
  89. pixeltable/tool/create_test_video.py +0 -81
  90. pixeltable/tool/doc_plugins/griffe.py +0 -50
  91. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  92. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  93. pixeltable/tool/embed_udf.py +0 -9
  94. pixeltable/tool/mypy_plugin.py +0 -55
  95. pixeltable-0.2.25.dist-info/RECORD +0 -154
  96. pixeltable-0.2.25.dist-info/entry_points.txt +0 -3
  97. {pixeltable-0.2.25.dist-info → pixeltable-0.3.0.dist-info}/LICENSE +0 -0
@@ -37,30 +37,89 @@ class EmbeddingIndex(IndexBase):
37
37
  Metric.L2: 'vector_l2_ops'
38
38
  }
39
39
 
40
+ metric: Metric
41
+ value_expr: exprs.FunctionCall
42
+ string_embed: Optional[func.Function]
43
+ image_embed: Optional[func.Function]
44
+ string_embed_signature_idx: int
45
+ image_embed_signature_idx: int
46
+ index_col_type: pgvector.sqlalchemy.Vector
47
+
40
48
  def __init__(
41
- self, c: catalog.Column, metric: str, string_embed: Optional[func.Function] = None,
42
- image_embed: Optional[func.Function] = None):
49
+ self,
50
+ c: catalog.Column,
51
+ metric: str,
52
+ embed: Optional[func.Function] = None,
53
+ string_embed: Optional[func.Function] = None,
54
+ image_embed: Optional[func.Function] = None,
55
+ ):
56
+ if embed is None and string_embed is None and image_embed is None:
57
+ raise excs.Error('At least one of `embed`, `string_embed`, or `image_embed` must be specified')
43
58
  metric_names = [m.name.lower() for m in self.Metric]
44
59
  if metric.lower() not in metric_names:
45
60
  raise excs.Error(f'Invalid metric {metric}, must be one of {metric_names}')
46
61
  if not c.col_type.is_string_type() and not c.col_type.is_image_type():
47
62
  raise excs.Error(f'Embedding index requires string or image column')
48
- if c.col_type.is_string_type() and string_embed is None:
49
- raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
50
- if c.col_type.is_image_type() and image_embed is None:
51
- raise excs.Error(f"Image embedding function is required for column {c.name} (parameter 'image_embed')")
63
+
64
+ self.string_embed = None
65
+ self.image_embed = None
66
+
67
+ # Resolve the specific embedding functions corresponding to the user-provided `string_embed`, `image_embed`,
68
+ # and/or `embed`. For string embeddings, `string_embed` will be used if specified; otherwise, `embed` will
69
+ # be used as a fallback, if it has a matching signature. Likewise for image embeddings.
70
+
52
71
  if string_embed is not None:
53
- # verify signature
54
- self._validate_embedding_fn(string_embed, 'string_embed', ts.ColumnType.Type.STRING)
72
+ # `string_embed` is specified; it MUST be valid.
73
+ self.string_embed = self._resolve_embedding_fn(string_embed, ts.ColumnType.Type.STRING)
74
+ if self.string_embed is None:
75
+ raise excs.Error(
76
+ f'The function `{string_embed.name}` is not a valid string embedding: '
77
+ 'it must take a single string parameter'
78
+ )
79
+ elif embed is not None:
80
+ # `embed` is specified; see if it has a string signature.
81
+ self.string_embed = self._resolve_embedding_fn(embed, ts.ColumnType.Type.STRING)
82
+
55
83
  if image_embed is not None:
56
- # verify signature
57
- self._validate_embedding_fn(image_embed, 'image_embed', ts.ColumnType.Type.IMAGE)
84
+ # `image_embed` is specified; it MUST be valid.
85
+ self.image_embed = self._resolve_embedding_fn(image_embed, ts.ColumnType.Type.IMAGE)
86
+ if self.image_embed is None:
87
+ raise excs.Error(
88
+ f'The function `{image_embed.name}` is not a valid image embedding: '
89
+ 'it must take a single image parameter'
90
+ )
91
+ elif embed is not None:
92
+ # `embed` is specified; see if it has an image signature.
93
+ self.image_embed = self._resolve_embedding_fn(embed, ts.ColumnType.Type.IMAGE)
94
+
95
+ if self.string_embed is None and self.image_embed is None:
96
+ # No string OR image signature was found. This can only happen if `embed` was specified and
97
+ # contains no matching signatures.
98
+ assert embed is not None
99
+ raise excs.Error(
100
+ f'The function `{embed.name}` is not a valid embedding: '
101
+ 'it must take a single string or image parameter'
102
+ )
103
+
104
+ # Now validate the return types of the embedding functions.
105
+
106
+ if self.string_embed is not None:
107
+ self._validate_embedding_fn(self.string_embed, ts.ColumnType.Type.STRING)
108
+
109
+ if self.image_embed is not None:
110
+ self._validate_embedding_fn(self.image_embed, ts.ColumnType.Type.IMAGE)
111
+
112
+ if c.col_type.is_string_type() and self.string_embed is None:
113
+ raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
114
+ if c.col_type.is_image_type() and self.image_embed is None:
115
+ raise excs.Error(f"Image embedding function is required for column {c.name} (parameter 'image_embed')")
58
116
 
59
117
  self.metric = self.Metric[metric.upper()]
60
- self.value_expr = string_embed(exprs.ColumnRef(c)) if c.col_type.is_string_type() else image_embed(exprs.ColumnRef(c))
118
+ self.value_expr = (
119
+ self.string_embed(exprs.ColumnRef(c)) if c.col_type.is_string_type()
120
+ else self.image_embed(exprs.ColumnRef(c))
121
+ )
61
122
  assert isinstance(self.value_expr.col_type, ts.ArrayType)
62
- self.string_embed = string_embed
63
- self.image_embed = image_embed
64
123
  vector_size = self.value_expr.col_type.shape[0]
65
124
  assert vector_size is not None
66
125
  self.index_col_type = pgvector.sqlalchemy.Vector(vector_size)
@@ -91,10 +150,10 @@ class EmbeddingIndex(IndexBase):
91
150
  assert isinstance(item, (str, PIL.Image.Image))
92
151
  if isinstance(item, str):
93
152
  assert self.string_embed is not None
94
- embedding = self.string_embed.exec(item)
153
+ embedding = self.string_embed.exec([item], {})
95
154
  if isinstance(item, PIL.Image.Image):
96
155
  assert self.image_embed is not None
97
- embedding = self.image_embed.exec(item)
156
+ embedding = self.image_embed.exec([item], {})
98
157
 
99
158
  if self.metric == self.Metric.COSINE:
100
159
  return val_column.sa_col.cosine_distance(embedding) * -1 + 1
@@ -110,10 +169,10 @@ class EmbeddingIndex(IndexBase):
110
169
  embedding: Optional[np.ndarray] = None
111
170
  if isinstance(item, str):
112
171
  assert self.string_embed is not None
113
- embedding = self.string_embed.exec(item)
172
+ embedding = self.string_embed.exec([item], {})
114
173
  if isinstance(item, PIL.Image.Image):
115
174
  assert self.image_embed is not None
116
- embedding = self.image_embed.exec(item)
175
+ embedding = self.image_embed.exec([item], {})
117
176
  assert embedding is not None
118
177
 
119
178
  if self.metric == self.Metric.COSINE:
@@ -132,34 +191,47 @@ class EmbeddingIndex(IndexBase):
132
191
  return 'embedding'
133
192
 
134
193
  @classmethod
135
- def _validate_embedding_fn(cls, embed_fn: func.Function, name: str, expected_type: ts.ColumnType.Type) -> None:
136
- """Validate the signature"""
194
+ def _resolve_embedding_fn(cls, embed_fn: func.Function, expected_type: ts.ColumnType.Type) -> Optional[func.Function]:
195
+ """Find an overload resolution for `embed_fn` that matches the given type."""
137
196
  assert isinstance(embed_fn, func.Function)
138
- sig = embed_fn.signature
197
+ for resolved_fn in embed_fn._resolved_fns:
198
+ # The embedding function must be a 1-ary function of the correct type. But it's ok if the function signature
199
+ # has more than one parameter, as long as it has at most one *required* parameter.
200
+ sig = resolved_fn.signature
201
+ if (len(sig.parameters) >= 1
202
+ and len(sig.required_parameters) <= 1
203
+ and sig.parameters_by_pos[0].col_type.type_enum == expected_type):
204
+ return resolved_fn
205
+ return None
139
206
 
140
- # The embedding function must be a 1-ary function of the correct type. But it's ok if the function signature
141
- # has more than one parameter, as long as it has at most one *required* parameter.
142
- if (len(sig.parameters) == 0
143
- or len(sig.required_parameters) > 1
144
- or sig.parameters_by_pos[0].col_type.type_enum != expected_type):
145
- raise excs.Error(
146
- f'{name} must take a single {expected_type.name.lower()} parameter, but has signature {sig}')
207
+ @classmethod
208
+ def _validate_embedding_fn(cls, embed_fn: func.Function, expected_type: ts.ColumnType.Type) -> None:
209
+ """Validate the given embedding function."""
210
+ assert not embed_fn.is_polymorphic
211
+ sig = embed_fn.signature
147
212
 
148
213
  # validate return type
149
214
  param_name = sig.parameters_by_pos[0].name
150
215
  if expected_type == ts.ColumnType.Type.STRING:
151
- return_type = embed_fn.call_return_type({param_name: 'dummy'})
216
+ return_type = embed_fn.call_return_type([], {param_name: 'dummy'})
152
217
  else:
153
218
  assert expected_type == ts.ColumnType.Type.IMAGE
154
219
  img = PIL.Image.new('RGB', (512, 512))
155
- return_type = embed_fn.call_return_type({param_name: img})
220
+ return_type = embed_fn.call_return_type([], {param_name: img})
221
+
156
222
  assert return_type is not None
157
223
  if not isinstance(return_type, ts.ArrayType):
158
- raise excs.Error(f'{name} must return an array, but returns {return_type}')
159
- else:
160
- shape = return_type.shape
161
- if len(shape) != 1 or shape[0] == None:
162
- raise excs.Error(f'{name} must return a 1D array of a specific length, but returns {return_type}')
224
+ raise excs.Error(
225
+ f'The function `{embed_fn.name}` is not a valid embedding: '
226
+ f'it must return an array, but returns {return_type}'
227
+ )
228
+
229
+ shape = return_type.shape
230
+ if len(shape) != 1 or shape[0] == None:
231
+ raise excs.Error(
232
+ f'The function `{embed_fn.name}` is not a valid embedding: '
233
+ f'it must return a 1-dimensional array of a specific length, but returns {return_type}'
234
+ )
163
235
 
164
236
  def as_dict(self) -> dict:
165
237
  return {
pixeltable/io/__init__.py CHANGED
@@ -2,7 +2,7 @@ from .external_store import ExternalStore, SyncStatus
2
2
  from .globals import create_label_studio_project, export_images_as_fo_dataset, import_json, import_rows
3
3
  from .hf_datasets import import_huggingface_dataset
4
4
  from .pandas import import_csv, import_excel, import_pandas
5
- from .parquet import import_parquet
5
+ from .parquet import import_parquet, export_parquet
6
6
 
7
7
  __default_dir = set(symbol for symbol in dir() if not symbol.startswith('_'))
8
8
  __removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet'}
@@ -574,7 +574,7 @@ class LabelStudioProject(Project):
574
574
  else:
575
575
  local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
576
576
  if local_annotations_column not in t._schema.keys():
577
- t[local_annotations_column] = pxt.JsonType(nullable=True)
577
+ t.add_columns({local_annotations_column: pxt.JsonType(nullable=True)})
578
578
 
579
579
  resolved_col_mapping = cls.validate_columns(
580
580
  t, config.export_columns, {ANNOTATIONS_COLUMN: pxt.JsonType(nullable=True)}, col_mapping)
pixeltable/io/parquet.py CHANGED
@@ -7,11 +7,14 @@ import random
7
7
  import typing
8
8
  from collections import deque
9
9
  from pathlib import Path
10
- from typing import Any, Optional
10
+ from typing import Any, Optional, Union
11
11
 
12
12
  import numpy as np
13
13
  import PIL.Image
14
+ import datetime
14
15
 
16
+ import pixeltable as pxt
17
+ from pixeltable.env import Env
15
18
  import pixeltable.exceptions as exc
16
19
  import pixeltable.type_system as ts
17
20
  from pixeltable.utils.transactional_directory import transactional_directory
@@ -39,28 +42,44 @@ def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path:
39
42
  parquet.write_table(tab, str(output_path))
40
43
 
41
44
 
42
- def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
45
+ def export_parquet(
46
+ table_or_df: Union[pxt.Table, pxt.DataFrame],
47
+ parquet_path: Path,
48
+ partition_size_bytes: int = 100_000_000,
49
+ inline_images: bool = False
50
+ ) -> None:
43
51
  """
44
- Internal method to stream dataframe data to parquet format.
45
- Does not materialize the dataset to memory.
52
+ Exports a dataframe's data to one or more Parquet files. Requires pyarrow to be installed.
46
53
 
47
- It preserves pixeltable type metadata in a json file, which would otherwise
54
+ It additionally writes the pixeltable metadata in a json file, which would otherwise
48
55
  not be available in the parquet format.
49
56
 
50
- Images are stored inline in a compressed format in their parquet file.
51
-
52
57
  Args:
53
- df : dataframe to save.
54
- dest_path : path to directory to save the parquet files to.
55
- partition_size_bytes : maximum target size for each chunk. Default 100_000_000 bytes.
58
+ table_or_df : Table or Dataframe to export.
59
+ parquet_path : Path to directory to write the parquet files to.
60
+ partition_size_bytes : The maximum target size for each chunk. Default 100_000_000 bytes.
61
+ inline_images : If True, images are stored inline in the parquet file. This is useful
62
+ for small images, to be imported as pytorch dataset. But can be inefficient
63
+ for large images, and cannot be imported into pixeltable.
64
+ If False, will raise an error if the Dataframe has any image column.
65
+ Default False.
56
66
  """
57
67
  from pixeltable.utils.arrow import to_arrow_schema
58
68
 
69
+ df: pxt.DataFrame
70
+ if isinstance(table_or_df, pxt.catalog.Table):
71
+ df = table_or_df._df()
72
+ else:
73
+ df = table_or_df
74
+
59
75
  type_dict = {k: v.as_dict() for k, v in df.schema.items()}
60
76
  arrow_schema = to_arrow_schema(df.schema)
61
77
 
78
+ if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
79
+ raise exc.Error('Cannot export Dataframe with image columns when inline_images is False')
80
+
62
81
  # store the changes atomically
63
- with transactional_directory(dest_path) as temp_path:
82
+ with transactional_directory(parquet_path) as temp_path:
64
83
  # dump metadata json file so we can inspect what was the source of the parquet file later on.
65
84
  json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
66
85
  json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w')) # keep type metadata
@@ -111,6 +130,7 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int =
111
130
  elif col_type.is_bool_type():
112
131
  length = 1
113
132
  elif col_type.is_timestamp_type():
133
+ val = val.astimezone(datetime.timezone.utc)
114
134
  length = 8
115
135
  else:
116
136
  assert False, f'unknown type {col_type} for {col_name}'
@@ -139,7 +159,7 @@ def parquet_schema_to_pixeltable_schema(parquet_path: str) -> dict[str, Optional
139
159
 
140
160
 
141
161
  def import_parquet(
142
- table_path: str,
162
+ table: str,
143
163
  *,
144
164
  parquet_path: str,
145
165
  schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
@@ -148,7 +168,7 @@ def import_parquet(
148
168
  """Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.
149
169
 
150
170
  Args:
151
- table_path: Path to the table.
171
+ table: Fully qualified name of the table to import the data into.
152
172
  parquet_path: Path to an individual Parquet file or directory of Parquet files.
153
173
  schema_overrides: If specified, then for each (name, type) pair in `schema_overrides`, the column with
154
174
  name `name` will be given type `type`, instead of being inferred from the Parquet dataset. The keys in
@@ -157,7 +177,7 @@ def import_parquet(
157
177
  kwargs: Additional arguments to pass to `create_table`.
158
178
 
159
179
  Returns:
160
- A handle to the newly created [`Table`][pixeltable.Table].
180
+ A handle to the newly created table.
161
181
  """
162
182
  from pyarrow import parquet
163
183
 
@@ -176,11 +196,11 @@ def import_parquet(
176
196
  if v is None:
177
197
  raise exc.Error(f'Could not infer pixeltable type for column {k} from parquet file')
178
198
 
179
- if table_path in pxt.list_tables():
180
- raise exc.Error(f'Table {table_path} already exists')
199
+ if table in pxt.list_tables():
200
+ raise exc.Error(f'Table {table} already exists')
181
201
 
182
202
  try:
183
- tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
203
+ tmp_name = f'{table}_tmp_{random.randint(0, 100000000)}'
184
204
  tab = pxt.create_table(tmp_name, schema, **kwargs)
185
205
  for fragment in parquet_dataset.fragments: # type: ignore[attr-defined]
186
206
  for batch in fragment.to_batches():
@@ -190,5 +210,5 @@ def import_parquet(
190
210
  _logger.error(f'Error while inserting Parquet file into table: {e}')
191
211
  raise e
192
212
 
193
- pxt.move(tmp_name, table_path)
194
- return pxt.get_table(table_path)
213
+ pxt.move(tmp_name, table)
214
+ return pxt.get_table(table)
@@ -151,6 +151,9 @@ class DocumentSplitter(ComponentIterator):
151
151
  elif self._doc_handle.format == DocumentType.DocumentFormat.PDF:
152
152
  assert self._doc_handle.pdf_doc is not None
153
153
  self._sections = self._pdf_sections()
154
+ elif self._doc_handle.format == DocumentType.DocumentFormat.TXT:
155
+ assert self._doc_handle.txt_doc is not None
156
+ self._sections = self._txt_sections()
154
157
  else:
155
158
  assert False, f'Unsupported document format: {self._doc_handle.format}'
156
159
 
@@ -389,6 +392,15 @@ class DocumentSplitter(ComponentIterator):
389
392
  if accumulated_text and not emit_on_page:
390
393
  yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
391
394
 
395
+ def _txt_sections(self) -> Iterator[DocumentSection]:
396
+ """Create DocumentSections for text files.
397
+
398
+ Currently, it returns the entire text as a single section.
399
+ TODO: Add support for paragraphs.
400
+ """
401
+ assert self._doc_handle.txt_doc is not None
402
+ yield DocumentSection(text=ftfy.fix_text(self._doc_handle.txt_doc), metadata=DocumentSectionMetadata())
403
+
392
404
  def _sentence_sections(self, input_sections: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
393
405
  """Split the input sections into sentences"""
394
406
  for section in input_sections:
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
10
10
  from .schema import SystemInfo, SystemInfoMd
11
11
 
12
12
  # current version of the metadata; this is incremented whenever the metadata schema changes
13
- VERSION = 22
13
+ VERSION = 26
14
14
 
15
15
 
16
16
  def create_system_info(engine: sql.engine.Engine) -> None:
@@ -1,3 +1,4 @@
1
+ from uuid import UUID
1
2
  import sqlalchemy as sql
2
3
 
3
4
  from pixeltable.metadata import register_converter
@@ -12,7 +13,7 @@ def _(engine: sql.engine.Engine) -> None:
12
13
  )
13
14
 
14
15
 
15
- def __update_table_md(table_md: dict) -> None:
16
+ def __update_table_md(table_md: dict, table_id: UUID) -> None:
16
17
  # External stores are not migratable; just drop them
17
18
  del table_md['remotes']
18
19
  table_md['external_stores'] = {}
@@ -1,3 +1,4 @@
1
+ from uuid import UUID
1
2
  import sqlalchemy as sql
2
3
 
3
4
  from pixeltable.metadata import register_converter
@@ -12,7 +13,7 @@ def _(engine: sql.engine.Engine) -> None:
12
13
  )
13
14
 
14
15
 
15
- def __update_table_md(table_md: dict) -> None:
16
+ def __update_table_md(table_md: dict, table_id: UUID) -> None:
16
17
  # key changes in IndexMd.init_args: img_embed -> image_embed, txt_embed -> string_embed
17
18
  if len(table_md['index_md']) == 0:
18
19
  return
@@ -0,0 +1,17 @@
1
+ from typing import Any, Optional
2
+ import sqlalchemy as sql
3
+
4
+ from pixeltable.metadata import register_converter
5
+ from pixeltable.metadata.converters.util import convert_table_md
6
+
7
+
8
+ @register_converter(version=22)
9
+ def _(engine: sql.engine.Engine) -> None:
10
+ convert_table_md(engine, substitution_fn=__substitute_md)
11
+
12
+
13
+ def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
14
+ if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'DataFrame':
15
+ v['from_clause'] = {'tbls': [v['tbl']], 'join_clauses': []}
16
+ return k, v
17
+ return None
@@ -0,0 +1,35 @@
1
+ import logging
2
+ from typing import Any, Optional
3
+ from uuid import UUID
4
+ import sqlalchemy as sql
5
+
6
+ from pixeltable.metadata import register_converter
7
+ from pixeltable.metadata.converters.util import convert_table_md
8
+ from pixeltable.metadata.schema import Table
9
+
10
+ _logger = logging.getLogger('pixeltable')
11
+
12
+ @register_converter(version=23)
13
+ def _(engine: sql.engine.Engine) -> None:
14
+ convert_table_md(
15
+ engine,
16
+ table_md_updater=__update_table_md
17
+ )
18
+
19
+ def __update_table_md(table_md: dict, table_id: UUID) -> None:
20
+ """update the index metadata to add indexed_col_tbl_id column if it is missing
21
+
22
+ Args:
23
+ table_md (dict): copy of the original table metadata. this gets updated in place.
24
+ table_id (UUID): the table id
25
+
26
+ """
27
+ if len(table_md['index_md']) == 0:
28
+ return
29
+ for idx_md in table_md['index_md'].values():
30
+ if 'indexed_col_tbl_id' not in idx_md:
31
+ # index metadata is missing indexed_col_tbl_id
32
+ # assume that the indexed column is in the same table
33
+ # and update the index metadata.
34
+ _logger.info(f'Updating index metadata for table: {table_id} index: {idx_md["id"]}')
35
+ idx_md['indexed_col_tbl_id'] = str(table_id)
@@ -0,0 +1,56 @@
1
+ from typing import Any, Optional
2
+
3
+ import sqlalchemy as sql
4
+
5
+ from pixeltable.metadata import register_converter
6
+ from pixeltable.metadata.converters.util import convert_table_md
7
+
8
+
9
+ @register_converter(version=24)
10
+ def _(engine: sql.engine.Engine) -> None:
11
+ convert_table_md(engine, substitution_fn=__substitute_md)
12
+
13
+
14
+ def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
15
+ from pixeltable import func
16
+ from pixeltable.func.globals import resolve_symbol
17
+
18
+ if (isinstance(v, dict) and
19
+ '_classpath' in v and
20
+ v['_classpath'] in ['pixeltable.func.callable_function.CallableFunction',
21
+ 'pixeltable.func.aggregate_function.AggregateFunction',
22
+ 'pixeltable.func.expr_template_function.ExprTemplateFunction']):
23
+ if 'path' in v:
24
+ assert 'signature' not in v
25
+ f = resolve_symbol(__substitute_path(v['path']))
26
+ assert isinstance(f, func.Function)
27
+ v['signature'] = f.signatures[0].as_dict()
28
+ return k, v
29
+
30
+ if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'FunctionCall':
31
+ # Correct an older serialization mechanism where Expr elements of FunctionCall args and
32
+ # kwargs were indicated with idx == -1 rather than None. This was fixed for InlineList
33
+ # and InlineDict back in convert_20, but not for FunctionCall.
34
+ assert 'args' in v and isinstance(v['args'], list)
35
+ assert 'kwargs' in v and isinstance(v['kwargs'], dict)
36
+ v['args'] = [
37
+ (None, arg) if idx == -1 else (idx, arg)
38
+ for idx, arg in v['args']
39
+ ]
40
+ v['kwargs'] = {
41
+ k: (None, arg) if idx == -1 else (idx, arg)
42
+ for k, (idx, arg) in v['kwargs'].items()
43
+ }
44
+ return k, v
45
+
46
+ return None
47
+
48
+
49
+ def __substitute_path(path: str) -> str:
50
+ # Starting with version 25, function signatures are preserved in metadata. To migrate from older
51
+ # versions, it's necessary to resolve the function symbol to get the signature. The following
52
+ # adjustment is necessary for function names that are stored in db artifacts of version < 25, but
53
+ # have changed in some version > 25.
54
+ if path in ['pixeltable.functions.huggingface.clip_text', 'pixeltable.functions.huggingface.clip_image']:
55
+ return 'pixeltable.functions.huggingface.clip'
56
+ return path
@@ -0,0 +1,19 @@
1
+ from typing import Any, Optional
2
+
3
+ import sqlalchemy as sql
4
+
5
+ from pixeltable.metadata import register_converter
6
+ from pixeltable.metadata.converters.util import convert_table_md
7
+
8
+
9
+ @register_converter(version=25)
10
+ def _(engine: sql.engine.Engine) -> None:
11
+ convert_table_md(engine, substitution_fn=__substitute_md)
12
+
13
+
14
+ def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
15
+ if k == 'path' and (
16
+ v in ['pixeltable.functions.huggingface.clip_text', 'pixeltable.functions.huggingface.clip_image']
17
+ ):
18
+ return 'path', 'pixeltable.functions.huggingface.clip'
19
+ return None
@@ -1,6 +1,7 @@
1
1
  import copy
2
2
  import logging
3
3
  from typing import Any, Callable, Optional
4
+ from uuid import UUID
4
5
 
5
6
  import sqlalchemy as sql
6
7
 
@@ -11,7 +12,7 @@ __logger = logging.getLogger('pixeltable')
11
12
 
12
13
  def convert_table_md(
13
14
  engine: sql.engine.Engine,
14
- table_md_updater: Optional[Callable[[dict], None]] = None,
15
+ table_md_updater: Optional[Callable[[dict, UUID], None]] = None,
15
16
  column_md_updater: Optional[Callable[[dict], None]] = None,
16
17
  external_store_md_updater: Optional[Callable[[dict], None]] = None,
17
18
  substitution_fn: Optional[Callable[[Optional[str], Any], Optional[tuple[Optional[str], Any]]]] = None
@@ -22,6 +23,7 @@ def convert_table_md(
22
23
  Args:
23
24
  engine: The SQLAlchemy engine.
24
25
  table_md_updater: A function that updates schema.TableMd dicts in place.
26
+ It takes two arguments: the metadata dict (new values) and the table id.
25
27
  column_md_updater: A function that updates schema.ColumnMd dicts in place.
26
28
  external_store_md_updater: A function that updates the external store metadata in place.
27
29
  substitution_fn: A function that substitutes metadata values. If specified, all metadata will be traversed
@@ -37,7 +39,7 @@ def convert_table_md(
37
39
  assert isinstance(table_md, dict)
38
40
  updated_table_md = copy.deepcopy(table_md)
39
41
  if table_md_updater is not None:
40
- table_md_updater(updated_table_md)
42
+ table_md_updater(updated_table_md, id)
41
43
  if column_md_updater is not None:
42
44
  __update_column_md(updated_table_md, column_md_updater)
43
45
  if external_store_md_updater is not None:
@@ -2,6 +2,10 @@
2
2
  # rather than as a comment, so that the existence of a description can be enforced by
3
3
  # the unit tests when new versions are added.
4
4
  VERSION_NOTES = {
5
+ 26: 'Rename clip_text and clip_image to clip',
6
+ 25: 'Functions with multiple signatures',
7
+ 24: 'Added TableMd/IndexMd.indexed_col_tbl_id',
8
+ 23: 'DataFrame.from_clause',
5
9
  22: 'TableMd/ColumnMd.media_validation',
6
10
  21: 'Separate InlineArray and InlineList',
7
11
  20: 'Store DB timestamps in UTC',
@@ -112,6 +112,7 @@ class IndexMd:
112
112
  """
113
113
  id: int
114
114
  name: str
115
+ indexed_col_tbl_id: str # UUID of the table (as string) that contains column being indexed
115
116
  indexed_col_id: int # column being indexed
116
117
  index_val_col_id: int # column holding the values to be indexed
117
118
  index_val_undo_col_id: int # column holding index values for deleted rows