pixeltable 0.2.6__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (56)
  1. pixeltable/__init__.py +3 -1
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/column.py +14 -2
  4. pixeltable/catalog/insertable_table.py +32 -17
  5. pixeltable/catalog/table.py +194 -12
  6. pixeltable/catalog/table_version.py +270 -110
  7. pixeltable/catalog/table_version_path.py +6 -1
  8. pixeltable/datatransfer/__init__.py +1 -0
  9. pixeltable/datatransfer/label_studio.py +526 -0
  10. pixeltable/datatransfer/remote.py +113 -0
  11. pixeltable/env.py +156 -73
  12. pixeltable/exprs/column_ref.py +2 -2
  13. pixeltable/exprs/comparison.py +39 -1
  14. pixeltable/exprs/data_row.py +7 -0
  15. pixeltable/exprs/expr.py +11 -12
  16. pixeltable/exprs/function_call.py +0 -3
  17. pixeltable/exprs/globals.py +14 -2
  18. pixeltable/exprs/similarity_expr.py +5 -3
  19. pixeltable/ext/functions/whisperx.py +30 -0
  20. pixeltable/ext/functions/yolox.py +16 -0
  21. pixeltable/func/aggregate_function.py +2 -2
  22. pixeltable/func/expr_template_function.py +3 -1
  23. pixeltable/func/udf.py +2 -2
  24. pixeltable/functions/fireworks.py +9 -4
  25. pixeltable/functions/huggingface.py +25 -1
  26. pixeltable/functions/openai.py +15 -10
  27. pixeltable/functions/together.py +11 -6
  28. pixeltable/functions/util.py +0 -43
  29. pixeltable/functions/video.py +46 -8
  30. pixeltable/globals.py +20 -2
  31. pixeltable/index/__init__.py +1 -0
  32. pixeltable/index/base.py +6 -1
  33. pixeltable/index/btree.py +54 -0
  34. pixeltable/index/embedding_index.py +4 -1
  35. pixeltable/io/__init__.py +1 -0
  36. pixeltable/io/globals.py +59 -0
  37. pixeltable/iterators/base.py +4 -4
  38. pixeltable/iterators/document.py +26 -15
  39. pixeltable/iterators/video.py +9 -1
  40. pixeltable/metadata/__init__.py +2 -2
  41. pixeltable/metadata/converters/convert_14.py +13 -0
  42. pixeltable/metadata/converters/convert_15.py +29 -0
  43. pixeltable/metadata/converters/util.py +63 -0
  44. pixeltable/metadata/schema.py +12 -6
  45. pixeltable/plan.py +9 -5
  46. pixeltable/store.py +14 -21
  47. pixeltable/tool/create_test_db_dump.py +16 -0
  48. pixeltable/type_system.py +14 -4
  49. pixeltable/utils/coco.py +94 -0
  50. pixeltable-0.2.7.dist-info/METADATA +137 -0
  51. {pixeltable-0.2.6.dist-info → pixeltable-0.2.7.dist-info}/RECORD +53 -46
  52. pixeltable/func/nos_function.py +0 -202
  53. pixeltable/utils/clip.py +0 -18
  54. pixeltable-0.2.6.dist-info/METADATA +0 -131
  55. {pixeltable-0.2.6.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0
  56. {pixeltable-0.2.6.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +0 -0
pixeltable/functions/fireworks.py CHANGED
@@ -6,8 +6,13 @@ import pixeltable as pxt
 from pixeltable import env


-def fireworks_client() -> fireworks.client.Fireworks:
-    return env.Env.get().get_client('fireworks', lambda api_key: fireworks.client.Fireworks(api_key=api_key))
+@env.register_client('fireworks')
+def _(api_key: str) -> fireworks.client.Fireworks:
+    return fireworks.client.Fireworks(api_key=api_key)
+
+
+def _fireworks_client() -> fireworks.client.Fireworks:
+    return env.Env.get().get_client('fireworks')


 @pxt.udf
@@ -26,8 +31,8 @@ def chat_completions(
         'top_p': top_p,
         'temperature': temperature
     }
-    kwargs_not_none = dict(filter(lambda x: x[1] is not None, kwargs.items()))
-    return fireworks_client().chat.completions.create(
+    kwargs_not_none = {k: v for k, v in kwargs.items() if v is not None}
+    return _fireworks_client().chat.completions.create(
         model=model,
         messages=messages,
         **kwargs_not_none
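
For orientation, here is a minimal sketch of how the refactored Fireworks integration is meant to be used from a table. The table and column names are hypothetical, and a Fireworks API key is assumed to be configured in the Pixeltable environment; the client itself is now constructed lazily through the `@env.register_client('fireworks')` hook shown above.

```python
import pixeltable as pxt
from pixeltable.functions.fireworks import chat_completions

# hypothetical table holding one prompt per row
t = pxt.create_table('prompts', {'prompt': pxt.StringType()})

# computed column that calls the Fireworks chat-completions UDF;
# the client is created on first use via the registered factory
t['response'] = chat_completions(
    messages=[{'role': 'user', 'content': t.prompt}],
    model='accounts/fireworks/models/mixtral-8x7b-instruct',  # example model id
)
```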
pixeltable/functions/huggingface.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Callable, TypeVar, Optional
+from typing import Callable, TypeVar, Optional, Any

 import PIL.Image
 import numpy as np
@@ -14,6 +14,7 @@ from pixeltable.functions.util import resolve_torch_device
 def sentence_transformer(
     sentences: Batch[str], *, model_id: str, normalize_embeddings: bool = False
 ) -> Batch[np.ndarray]:
+    """Runs the specified sentence transformer model."""
     env.Env.get().require_package('sentence_transformers')
     from sentence_transformers import SentenceTransformer

@@ -46,6 +47,7 @@ def sentence_transformer_list(sentences: list, *, model_id: str, normalize_embed

 @pxt.udf(batch_size=32)
 def cross_encoder(sentences1: Batch[str], sentences2: Batch[str], *, model_id: str) -> Batch[float]:
+    """Runs the specified cross-encoder model."""
     env.Env.get().require_package('sentence_transformers')
     from sentence_transformers import CrossEncoder

@@ -68,6 +70,7 @@ def cross_encoder_list(sentence1: str, sentences2: list, *, model_id: str) -> li

 @pxt.udf(batch_size=32, return_type=ts.ArrayType((None,), dtype=ts.FloatType(), nullable=False))
 def clip_text(text: Batch[str], *, model_id: str) -> Batch[np.ndarray]:
+    """Runs the specified CLIP model on text."""
     env.Env.get().require_package('transformers')
     device = resolve_torch_device('auto')
     import torch
@@ -85,6 +88,7 @@ def clip_text(text: Batch[str], *, model_id: str) -> Batch[np.ndarray]:

 @pxt.udf(batch_size=32, return_type=ts.ArrayType((None,), dtype=ts.FloatType(), nullable=False))
 def clip_image(image: Batch[PIL.Image.Image], *, model_id: str) -> Batch[np.ndarray]:
+    """Runs the specified CLIP model on images."""
     env.Env.get().require_package('transformers')
     device = resolve_torch_device('auto')
     import torch
@@ -113,6 +117,7 @@ def _(model_id: str) -> ts.ArrayType:

 @pxt.udf(batch_size=4)
 def detr_for_object_detection(image: Batch[PIL.Image.Image], *, model_id: str, threshold: float = 0.5) -> Batch[dict]:
+    """Runs the specified DETR model."""
     env.Env.get().require_package('transformers')
     device = resolve_torch_device('auto')
     import torch
@@ -140,6 +145,25 @@ def detr_for_object_detection(image: Batch[PIL.Image.Image], *, model_id: str, t
     ]


+@pxt.udf
+def detr_to_coco(image: PIL.Image.Image, detr_info: dict[str, Any]) -> dict[str, Any]:
+    bboxes, labels = detr_info['boxes'], detr_info['labels']
+    annotations = [
+        {
+            'bbox': [bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]],
+            'category': label
+        }
+        for bbox, label in zip(bboxes, labels)
+    ]
+    return {
+        'image': {
+            'width': image.width,
+            'height': image.height
+        },
+        'annotations': annotations
+    }
+
+
 T = TypeVar('T')


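
The new `detr_to_coco` UDF is designed to post-process the output of `detr_for_object_detection` into COCO-style annotations. A hedged sketch of the intended chaining (table, column names, and model id are illustrative):

```python
import pixeltable as pxt
from pixeltable.functions.huggingface import detr_for_object_detection, detr_to_coco

# hypothetical table with an image column
t = pxt.create_table('images', {'img': pxt.ImageType()})

# run DETR, then reshape its boxes/labels into COCO-format annotations
t['detections'] = detr_for_object_detection(t.img, model_id='facebook/detr-resnet-50')
t['detections_coco'] = detr_to_coco(t.img, t.detections)
```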
pixeltable/functions/openai.py CHANGED
@@ -16,8 +16,13 @@ from pixeltable import env
 from pixeltable.func import Batch


-def openai_client() -> openai.OpenAI:
-    return env.Env.get().get_client('openai', lambda api_key: openai.OpenAI(api_key=api_key))
+@env.register_client('openai')
+def _(api_key: str) -> openai.OpenAI:
+    return openai.OpenAI(api_key=api_key)
+
+
+def _openai_client() -> openai.OpenAI:
+    return env.Env.get().get_client('openai')


 # Exponential backoff decorator using tenacity.
@@ -44,7 +49,7 @@ def speech(
     response_format: Optional[str] = None,
     speed: Optional[float] = None
 ) -> str:
-    content = openai_client().audio.speech.create(
+    content = _openai_client().audio.speech.create(
         input=input,
         model=model,
         voice=voice,
@@ -71,7 +76,7 @@ def transcriptions(
     temperature: Optional[float] = None
 ) -> dict:
     file = pathlib.Path(audio)
-    transcription = openai_client().audio.transcriptions.create(
+    transcription = _openai_client().audio.transcriptions.create(
         file=file,
         model=model,
         language=_opt(language),
@@ -93,7 +98,7 @@ def translations(
     temperature: Optional[float] = None
 ) -> dict:
     file = pathlib.Path(audio)
-    translation = openai_client().audio.translations.create(
+    translation = _openai_client().audio.translations.create(
         file=file,
         model=model,
         prompt=_opt(prompt),
@@ -127,7 +132,7 @@ def chat_completions(
     tool_choice: Optional[dict] = None,
     user: Optional[str] = None
 ) -> dict:
-    result = openai_client().chat.completions.create(
+    result = _openai_client().chat.completions.create(
         messages=messages,
         model=model,
         frequency_penalty=_opt(frequency_penalty),
@@ -171,7 +176,7 @@ def vision(
             }}
         ]}
     ]
-    result = openai_client().chat.completions.create(
+    result = _openai_client().chat.completions.create(
         messages=messages,
         model=model
     )
@@ -197,7 +202,7 @@ def embeddings(
     dimensions: Optional[int] = None,
     user: Optional[str] = None
 ) -> Batch[np.ndarray]:
-    result = openai_client().embeddings.create(
+    result = _openai_client().embeddings.create(
         input=input,
         model=model,
         dimensions=_opt(dimensions),
@@ -235,7 +240,7 @@ def image_generations(
     user: Optional[str] = None
 ) -> PIL.Image.Image:
     # TODO(aaron-siegel): Decompose CPU/GPU ops into separate functions
-    result = openai_client().images.generate(
+    result = _openai_client().images.generate(
         prompt=prompt,
         model=_opt(model),
         quality=_opt(quality),
@@ -275,7 +280,7 @@ def moderations(
     *,
     model: Optional[str] = None
 ) -> dict:
-    result = openai_client().moderations.create(
+    result = _openai_client().moderations.create(
         input=input,
         model=_opt(model)
     )
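
The OpenAI UDFs keep their signatures; only the client plumbing changes. As an illustration, the batched `embeddings` UDF can back a computed column (names are hypothetical, and an OpenAI API key is assumed to be configured in the environment):

```python
import pixeltable as pxt
from pixeltable.functions.openai import embeddings

t = pxt.create_table('passages', {'text': pxt.StringType()})

# one embedding per row, computed in batches by the UDF
t['text_embedding'] = embeddings(t.text, model='text-embedding-3-small')
```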
pixeltable/functions/together.py CHANGED
@@ -11,8 +11,13 @@ from pixeltable import env
 from pixeltable.func import Batch


-def together_client() -> together.Together:
-    return env.Env.get().get_client('together', lambda api_key: together.Together(api_key=api_key))
+@env.register_client('together')
+def _(api_key: str) -> together.Together:
+    return together.Together(api_key=api_key)
+
+
+def _together_client() -> together.Together:
+    return env.Env.get().get_client('together')


 @pxt.udf
@@ -31,7 +36,7 @@ def completions(
     n: Optional[int] = None,
     safety_model: Optional[str] = None
 ) -> dict:
-    return together_client().completions.create(
+    return _together_client().completions.create(
         prompt=prompt,
         model=model,
         max_tokens=max_tokens,
@@ -66,7 +71,7 @@ def chat_completions(
     tools: Optional[dict] = None,
     tool_choice: Optional[dict] = None
 ) -> dict:
-    return together_client().chat.completions.create(
+    return _together_client().chat.completions.create(
         messages=messages,
         model=model,
         max_tokens=max_tokens,
@@ -99,7 +104,7 @@ _embedding_dimensions_cache = {

 @pxt.udf(batch_size=32, return_type=pxt.ArrayType((None,), dtype=pxt.FloatType()))
 def embeddings(input: Batch[str], *, model: str) -> Batch[np.ndarray]:
-    result = together_client().embeddings.create(input=input, model=model)
+    result = _together_client().embeddings.create(input=input, model=model)
     return [
         np.array(data.embedding, dtype=np.float64)
         for data in result.data
@@ -127,7 +132,7 @@ def image_generations(
     negative_prompt: Optional[str] = None,
 ) -> PIL.Image.Image:
     # TODO(aaron-siegel): Decompose CPU/GPU ops into separate functions
-    result = together_client().images.generate(
+    result = _together_client().images.generate(
         prompt=prompt,
         model=model,
         steps=steps,
pixeltable/functions/util.py CHANGED
@@ -1,46 +1,3 @@
-from typing import Tuple, List, Optional
-import types
-import sys
-
-import pixeltable.func as func
-import pixeltable.type_system as ts
-import pixeltable.env as env
-
-
-def create_nos_modules() -> List[types.ModuleType]:
-    """Create module pixeltable.functions.nos with one submodule per task and return the submodules"""
-    models = env.Env.get().nos_client.ListModels()
-    model_info = [env.Env.get().nos_client.GetModelInfo(model) for model in models]
-    model_info.sort(key=lambda info: info.task.value)
-
-    module_name = 'pixeltable.functions.nos'
-    nos_module = types.ModuleType(module_name)
-    nos_module.__package__ = 'pixeltable.functions'
-    sys.modules[module_name] = nos_module
-
-    prev_task = ''
-    new_modules: List[types.ModuleType] = []
-    sub_module: Optional[types.ModuleType] = None
-    for info in model_info:
-        if info.task.value != prev_task:
-            # we construct one submodule per task
-            namespace = info.task.name.lower()
-            submodule_name = f'{module_name}.{namespace}'
-            sub_module = types.ModuleType(submodule_name)
-            sub_module.__package__ = module_name
-            setattr(nos_module, namespace, sub_module)
-            new_modules.append(sub_module)
-            sys.modules[submodule_name] = sub_module
-            prev_task = info.task.value
-
-        # add a Function for this model to the module
-        model_id = info.name.replace("/", "_").replace("-", "_")
-        pt_func = func.NOSFunction(info, f'{submodule_name}.{model_id}')
-        setattr(sub_module, model_id, pt_func)
-
-    return new_modules
-
-
 def resolve_torch_device(device: str) -> str:
     import torch
     if device == 'auto':
pixeltable/functions/video.py CHANGED
@@ -1,14 +1,13 @@
-from typing import Optional
 import uuid
+from typing import Optional
+
 import av
-import sys

 import pixeltable.env as env
 import pixeltable.func as func
 import pixeltable.type_system as ts

-
-_format_defaults = { # format -> (codec, ext)
+_format_defaults = {  # format -> (codec, ext)
     'wav': ('pcm_s16le', 'wav'),
     'mp3': ('libmp3lame', 'mp3'),
     'flac': ('flac', 'flac'),
@@ -35,11 +34,13 @@ _extract_audio_param_types = [
     ts.VideoType(nullable=False),
     ts.IntType(nullable=False),
     ts.StringType(nullable=False),
-    ts.StringType(nullable=False)
+    ts.StringType(nullable=True),
 ]
+
+
 @func.udf(return_type=ts.AudioType(nullable=True), param_types=_extract_audio_param_types)
 def extract_audio(
-        video_path: str, stream_idx: int = 0, format: str = 'wav', codec: Optional[str] = None
+    video_path: str, stream_idx: int = 0, format: str = 'wav', codec: Optional[str] = None
 ) -> Optional[str]:
     """Extract an audio stream from a video file, save it as a media file and return its path"""
     if format not in _format_defaults:
@@ -51,12 +52,49 @@ def extract_audio(
         return None
     audio_stream = container.streams.audio[stream_idx]
     # create this in our tmp directory, so it'll get cleaned up if it's being generated as part of a query
-    output_filename = str(env.Env.get().tmp_dir / f"{uuid.uuid4()}.{ext}")
+    output_filename = str(env.Env.get().tmp_dir / f'{uuid.uuid4()}.{ext}')

-    with av.open(output_filename, "w", format=format) as output_container:
+    with av.open(output_filename, 'w', format=format) as output_container:
         output_stream = output_container.add_stream(codec or default_codec)
         for packet in container.demux(audio_stream):
             for frame in packet.decode():
                 output_container.mux(output_stream.encode(frame))

     return output_filename
+
+
+@func.udf(return_type=ts.JsonType(nullable=False), param_types=[ts.VideoType(nullable=False)])
+def get_metadata(video: str) -> dict:
+    """Gets various metadata associated with a video file.
+
+    Args:
+        video (str): Path to the video file.
+
+    Returns:
+        A dictionary containing the associated metadata.
+    """
+    with av.open(video) as container:
+        assert isinstance(container, av.container.InputContainer)
+        video_streams_info = [
+            {
+                'duration': stream.duration,
+                'frames': stream.frames,
+                'language': stream.language,
+                'average_rate': float(stream.average_rate) if stream.average_rate is not None else None,
+                'base_rate': float(stream.base_rate) if stream.base_rate is not None else None,
+                'guessed_rate': float(stream.guessed_rate) if stream.guessed_rate is not None else None,
+                'pix_fmt': getattr(stream.codec_context, 'pix_fmt', None),
+                'width': stream.width,
+                'height': stream.height,
+            }
+            for stream in container.streams
+            if isinstance(stream, av.video.stream.VideoStream)
+        ]
+        result = {
+            'bit_exact': container.bit_exact,
+            'bit_rate': container.bit_rate,
+            'size': container.size,
+            'metadata': container.metadata,
+            'streams': video_streams_info,  # TODO: Audio streams?
+        }
+        return result
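
A rough sketch of how the new `get_metadata` UDF and the updated `extract_audio` can be applied to a video column (table and column names are hypothetical):

```python
import pixeltable as pxt
from pixeltable.functions.video import extract_audio, get_metadata

t = pxt.create_table('videos', {'video': pxt.VideoType()})

# per-video stream metadata (duration, frame count, rates, ...) stored as JSON
t['metadata'] = get_metadata(t.video)

# extract the audio track into a separate media file; the codec defaults per format
t['audio'] = extract_audio(t.video, format='mp3')
```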
pixeltable/globals.py CHANGED
@@ -96,8 +96,8 @@ def create_view(
         schema: dictionary mapping column names to column types, value expressions, or to column specifications.
         filter: Predicate to filter rows of the base table.
         is_snapshot: Whether the view is a snapshot.
-        iterator_class: Class of the iterator to use for the view.
-        iterator_args: Arguments to pass to the iterator class.
+        iterator: The iterator to use for this view. If specified, then this view will be a one-to-many view of
+            the base table.
         num_retained_versions: Number of versions of the view to retain.
         ignore_errors: if True, fail silently if the path already exists or is invalid.

@@ -423,3 +423,21 @@ def get_path(schema_obj: catalog.SchemaObject) -> str:
         dir_id = dir._dir_id
     path_elements.append(schema_obj._name)
     return '.'.join(path_elements)
+
+
+def configure_logging(
+    *,
+    to_stdout: Optional[bool] = None,
+    level: Optional[int] = None,
+    add: Optional[str] = None,
+    remove: Optional[str] = None,
+) -> None:
+    """Configure logging.
+
+    Args:
+        to_stdout: if True, also log to stdout
+        level: default log level
+        add: comma-separated list of 'module name:log level' pairs; ex.: add='video:10'
+        remove: comma-separated list of module names
+    """
+    return Env.get().configure_logging(to_stdout=to_stdout, level=level, add=add, remove=remove)
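
The new `configure_logging` function is a thin wrapper over `Env.configure_logging`. Assuming it is re-exported at the package level (pixeltable/__init__.py also changes in this release), usage looks roughly like this, reusing the `'video:10'` example from the docstring:

```python
import logging
import pixeltable as pxt

# also log to stdout, default level INFO, DEBUG (10) for the 'video' module
pxt.configure_logging(to_stdout=True, level=logging.INFO, add='video:10')
```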
pixeltable/index/__init__.py CHANGED
@@ -1,2 +1,3 @@
 from .base import IndexBase
 from .embedding_index import EmbeddingIndex
+from .btree import BtreeIndex
pixeltable/index/base.py CHANGED
@@ -27,7 +27,12 @@ class IndexBase(abc.ABC):
         pass

     @abc.abstractmethod
-    def index_sa_type(self) -> sql.sqltypes.TypeEngine:
+    def records_value_errors(self) -> bool:
+        """True if index_value_expr() can raise errors"""
+        pass
+
+    @abc.abstractmethod
+    def index_sa_type(self) -> sql.types.TypeEngine:
         """Return the sqlalchemy type of the index value column"""
         pass

pixeltable/index/btree.py ADDED
@@ -0,0 +1,54 @@
+from typing import Optional
+
+import sqlalchemy as sql
+
+# TODO: why does this import result in a circular import, but the one im embedding_index.py doesn't?
+#import pixeltable.catalog as catalog
+import pixeltable.exceptions as excs
+import pixeltable.func as func
+from .base import IndexBase
+
+
+class BtreeIndex(IndexBase):
+    """
+    Interface to B-tree indices in Postgres.
+    """
+    MAX_STRING_LEN = 256
+
+    @func.udf
+    def str_filter(s: Optional[str]) -> Optional[str]:
+        if s is None:
+            return None
+        return s[:BtreeIndex.MAX_STRING_LEN]
+
+    def __init__(self, c: 'catalog.Column'):
+        if not c.col_type.is_scalar_type() and not c.col_type.is_media_type():
+            raise excs.Error(f'Index on column {c.name}: B-tree index requires scalar or media type, got {c.col_type}')
+        from pixeltable.exprs import ColumnRef
+        self.value_expr = self.str_filter(ColumnRef(c)) if c.col_type.is_string_type() else ColumnRef(c)
+
+    def index_value_expr(self) -> 'pixeltable.exprs.Expr':
+        return self.value_expr
+
+    def records_value_errors(self) -> bool:
+        return False
+
+    def index_sa_type(self) -> sql.types.TypeEngine:
+        """Return the sqlalchemy type of the index value column"""
+        return self.value_expr.col_type.to_sa_type()
+
+    def create_index(self, index_name: str, index_value_col: 'catalog.Column', conn: sql.engine.Connection) -> None:
+        """Create the index on the index value column"""
+        idx = sql.Index(index_name, index_value_col.sa_col, postgresql_using='btree')
+        idx.create(bind=conn)
+
+    @classmethod
+    def display_name(cls) -> str:
+        return 'btree'
+
+    def as_dict(self) -> dict:
+        return {}
+
+    @classmethod
+    def from_dict(cls, c: 'catalog.Column', d: dict) -> 'BtreeIndex':
+        return cls(c)
pixeltable/index/embedding_index.py CHANGED
@@ -70,7 +70,10 @@ class EmbeddingIndex(IndexBase):
         """Return expression that computes the value that goes into the index"""
         return self.value_expr

-    def index_sa_type(self) -> sql.sqltypes.TypeEngine:
+    def records_value_errors(self) -> bool:
+        return True
+
+    def index_sa_type(self) -> sql.types.TypeEngine:
         """Return the sqlalchemy type of the index value column"""
         return self.index_col_type

pixeltable/io/__init__.py CHANGED
@@ -1,3 +1,4 @@
+from .globals import create_label_studio_project
 from .hf_datasets import import_huggingface_dataset
 from .pandas import import_csv, import_excel, import_pandas
 from .parquet import import_parquet
pixeltable/io/globals.py ADDED
@@ -0,0 +1,59 @@
+from typing import Any, Optional, Literal
+
+import pixeltable as pxt
+from pixeltable import Table
+
+
+def create_label_studio_project(
+    t: Table,
+    label_config: str,
+    col_mapping: Optional[dict[str, str]] = None,
+    title: Optional[str] = None,
+    media_import_method: Literal['post', 'file'] = 'file',
+    sync_immediately: bool = True,
+    **kwargs: Any
+) -> None:
+    """
+    Creates a new Label Studio project and links it to the specified `Table`.
+
+    The required parameter `label_config` specifies the Label Studio project configuration,
+    in XML format, as described in the Label Studio documentation. The linked project will
+    have one column for each data field in the configuration; for example, if the
+    configuration has an entry
+    ```
+    <Image name="image_obj" value="$image"/>
+    ```
+    then the linked project will have a column named `image`. In addition, the linked project
+    will always have a JSON-typed column `annotations` representing the output.
+
+    By default, Pixeltable will link each of these columns to a column of the specified `Table`
+    with the same name. If any of the data fields are missing, an exception will be thrown. If
+    the `annotations` column is missing, it will be created. The default names can be overridden
+    by specifying an optional `col_mapping`, with Pixeltable column names as keys and Label
+    Studio field names as values.
+
+    Args:
+        t: The Table to link to.
+        label_config: The Label Studio project configuration, in XML format.
+        col_mapping: An optional mapping of local column names to remote column names.
+        title: An optional title for the Label Studio project. If not specified, the
+            name of the `Table` will be used as a default.
+        sync_immediately: If `True`, immediately perform an initial synchronization by
+            importing all rows of the `Table` as Label Studio tasks.
+    """
+    from pixeltable.datatransfer.label_studio import LabelStudioProject, ANNOTATIONS_COLUMN
+
+    ls_project = LabelStudioProject.create(title or t.get_name(), label_config, media_import_method, **kwargs)
+
+    # Create a column to hold the annotations, if one does not yet exist.
+    if col_mapping is not None and ANNOTATIONS_COLUMN in col_mapping.values():
+        local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
+    else:
+        local_annotations_column = ANNOTATIONS_COLUMN
+    if local_annotations_column not in t.column_names():
+        t[local_annotations_column] = pxt.JsonType(nullable=True)
+
+    # Link the project to `t`, and sync if appropriate.
+    t._link(ls_project, col_mapping)
+    if sync_immediately:
+        t.sync()
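
Following the docstring above, a sketch of how `create_label_studio_project` is meant to be called. The label config and table are illustrative, and a Label Studio server URL and API key are assumed to be configured in the Pixeltable environment:

```python
import pixeltable as pxt
from pixeltable.io import create_label_studio_project

t = pxt.create_table('annotation_queue', {'image': pxt.ImageType()})

label_config = '''
<View>
  <Image name="image_obj" value="$image"/>
  <Choices name="class" toName="image_obj">
    <Choice value="cat"/>
    <Choice value="dog"/>
  </Choices>
</View>
'''

# creates the project, adds an 'annotations' column if one doesn't exist,
# links the project to the table, and runs an initial sync
create_label_studio_project(t, label_config)
```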
pixeltable/iterators/base.py CHANGED
@@ -6,11 +6,11 @@ from pixeltable.type_system import ColumnType


 class ComponentIterator(ABC):
-    """Base class for iterators."""
+    """Base class for Pixeltable iterators."""

     @classmethod
     @abstractmethod
-    def input_schema(cls) -> Dict[str, ColumnType]:
+    def input_schema(cls) -> dict[str, ColumnType]:
         """Provide the Pixeltable types of the init() parameters

         The keys need to match the names of the init() parameters. This is equivalent to the parameters_types
@@ -20,7 +20,7 @@ class ComponentIterator(ABC):

     @classmethod
     @abstractmethod
-    def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
+    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
         """Specify the dictionary returned by next() and a list of unstored column names

         Returns:
@@ -33,7 +33,7 @@ class ComponentIterator(ABC):
         return self

     @abstractmethod
-    def __next__(self) -> Dict[str, Any]:
+    def __next__(self) -> dict[str, Any]:
         """Return the next element of the iterator as a dictionary or raise StopIteration"""
         raise NotImplementedError

pixeltable/iterators/document.py CHANGED
@@ -13,6 +13,7 @@ from .base import ComponentIterator

 _logger = logging.getLogger('pixeltable')

+
 class ChunkMetadata(enum.Enum):
     TITLE = 1
     HEADING = 2
@@ -20,6 +21,7 @@ class ChunkMetadata(enum.Enum):
     PAGE = 4
     BOUNDING_BOX = 5

+
 class Separator(enum.Enum):
     HEADING = 1
     PARAGRAPH = 2
@@ -28,6 +30,7 @@ class Separator(enum.Enum):
     CHAR_LIMIT = 5
     PAGE = 6

+
 @dataclasses.dataclass
 class DocumentSectionMetadata:
     """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
@@ -42,6 +45,7 @@ class DocumentSectionMetadata:
     # bounding box as an {x1, y1, x2, y2} dictionary
     bounding_box: Optional[Dict[str, float]] = None

+
 @dataclasses.dataclass
 class DocumentSection:
     """A single document chunk, according to some of the splitting criteria"""
@@ -79,20 +83,14 @@ def _parse_metadata(metadata: str) -> List[ChunkMetadata]:

 _HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}

+
 class DocumentSplitter(ComponentIterator):
-    """Iterator over pieces of a document. The document is split into chunks based on the specified separators.
-    The iterator output tuples are of schema {'text': StringType()}, but can include additional metadata fields if specified
-    in the `metadata` argument as explained below.
-    All chunk text is passed through `ftfy.fix_text` to fix up common problems with unicode sequences.
-
-    Args:
-        `metadata`: which additional metadata fields to include in the output schema:
-            'title', 'heading' (HTML and Markdown), 'sourceline' (HTML), 'page' (PDF), 'bounding_box' (PDF).
-            The input can be a comma-separated string of these values eg. 'title,heading,sourceline'.
-        `separators`: which separators to use to split the document into rows. Options are:
-            'heading', 'paragraph', 'sentence', 'token_limit', 'char_limit', 'page'. As with metadata, this is can be a
-            comma-separated string eg. 'heading, token_limit'.
-        `limit`: the maximum number of tokens or characters in each chunk if 'token_limit' or 'char_limit' is specified.
+    """Iterator over chunks of a document. The document is chunked according to the specified `separators`.
+
+    The iterator yields a `text` field containing the text of the chunk, and it may also
+    include additional metadata fields if specified in the `metadata` parameter, as explained below.
+
+    Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
     """
     METADATA_COLUMN_TYPES = {
         ChunkMetadata.TITLE: StringType(nullable=True),
@@ -103,10 +101,23 @@ class DocumentSplitter(ComponentIterator):
     }

     def __init__(
-        self, document: str, *, separators: str, limit: Optional[int] = None, overlap: Optional[int] = None, metadata: str = '',
-        html_skip_tags: Optional[List[str]] = None, tiktoken_encoding: Optional[str] = 'cl100k_base',
+        self, document: str, *, separators: str, limit: Optional[int] = None, overlap: Optional[int] = None,
+        metadata: str = '',
+        html_skip_tags: Optional[list[str]] = None, tiktoken_encoding: Optional[str] = 'cl100k_base',
         tiktoken_target_model: Optional[str] = None
     ):
+        """Init method for `DocumentSplitter` class.
+
+        Args:
+            separators: separators to use to chunk the document. Options are:
+                `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
+                This may be a comma-separated string, e.g., `'heading,token_limit'`.
+            limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
+                or `'char_limit'` is specified.
+            metadata: additional metadata fields to include in the output. Options are:
+                `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
+                (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
+        """
         if html_skip_tags is None:
             html_skip_tags = ['nav']
         self._doc_handle = get_document_handle(document)
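
To close, a sketch of `DocumentSplitter` used directly, based on the `__init__` signature and docstring above (the document path is hypothetical):

```python
from pixeltable.iterators.document import DocumentSplitter

# ~300-token chunks of a PDF, with page metadata attached to each chunk
splitter = DocumentSplitter(
    '/path/to/report.pdf', separators='token_limit', limit=300, metadata='page'
)
for chunk in splitter:
    print(chunk['page'], chunk['text'][:80])
```

In practice the splitter is usually attached to a view via `create_view`'s new `iterator` parameter (see the pixeltable/globals.py docstring change above), which produces one view row per chunk.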