pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. pixeltable/__init__.py +42 -8
  2. pixeltable/{dataframe.py → _query.py} +470 -206
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +5 -4
  5. pixeltable/catalog/catalog.py +1785 -432
  6. pixeltable/catalog/column.py +190 -113
  7. pixeltable/catalog/dir.py +2 -4
  8. pixeltable/catalog/globals.py +19 -46
  9. pixeltable/catalog/insertable_table.py +191 -98
  10. pixeltable/catalog/path.py +63 -23
  11. pixeltable/catalog/schema_object.py +11 -15
  12. pixeltable/catalog/table.py +843 -436
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +978 -657
  15. pixeltable/catalog/table_version_handle.py +72 -16
  16. pixeltable/catalog/table_version_path.py +112 -43
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +134 -90
  20. pixeltable/config.py +134 -22
  21. pixeltable/env.py +471 -157
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +4 -1
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +11 -7
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +106 -56
  37. pixeltable/exec/globals.py +35 -0
  38. pixeltable/exec/in_memory_data_node.py +19 -19
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +351 -84
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +36 -23
  46. pixeltable/exprs/column_ref.py +213 -89
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +164 -54
  50. pixeltable/exprs/expr.py +70 -44
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +100 -40
  54. pixeltable/exprs/globals.py +2 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +18 -32
  57. pixeltable/exprs/is_null.py +7 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +27 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +167 -67
  64. pixeltable/exprs/rowid_ref.py +25 -10
  65. pixeltable/exprs/similarity_expr.py +58 -40
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +17 -11
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +29 -27
  78. pixeltable/func/signature.py +46 -19
  79. pixeltable/func/tools.py +31 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +16 -0
  82. pixeltable/functions/anthropic.py +123 -77
  83. pixeltable/functions/audio.py +147 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +7 -4
  86. pixeltable/functions/deepseek.py +35 -43
  87. pixeltable/functions/document.py +81 -0
  88. pixeltable/functions/fal.py +76 -0
  89. pixeltable/functions/fireworks.py +11 -20
  90. pixeltable/functions/gemini.py +195 -39
  91. pixeltable/functions/globals.py +142 -14
  92. pixeltable/functions/groq.py +108 -0
  93. pixeltable/functions/huggingface.py +1056 -24
  94. pixeltable/functions/image.py +115 -57
  95. pixeltable/functions/json.py +1 -1
  96. pixeltable/functions/llama_cpp.py +28 -13
  97. pixeltable/functions/math.py +67 -5
  98. pixeltable/functions/mistralai.py +18 -55
  99. pixeltable/functions/net.py +70 -0
  100. pixeltable/functions/ollama.py +20 -13
  101. pixeltable/functions/openai.py +240 -226
  102. pixeltable/functions/openrouter.py +143 -0
  103. pixeltable/functions/replicate.py +4 -4
  104. pixeltable/functions/reve.py +250 -0
  105. pixeltable/functions/string.py +239 -69
  106. pixeltable/functions/timestamp.py +16 -16
  107. pixeltable/functions/together.py +24 -84
  108. pixeltable/functions/twelvelabs.py +188 -0
  109. pixeltable/functions/util.py +6 -1
  110. pixeltable/functions/uuid.py +30 -0
  111. pixeltable/functions/video.py +1515 -107
  112. pixeltable/functions/vision.py +8 -8
  113. pixeltable/functions/voyageai.py +289 -0
  114. pixeltable/functions/whisper.py +16 -8
  115. pixeltable/functions/whisperx.py +179 -0
  116. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  117. pixeltable/globals.py +362 -115
  118. pixeltable/index/base.py +17 -21
  119. pixeltable/index/btree.py +28 -22
  120. pixeltable/index/embedding_index.py +100 -118
  121. pixeltable/io/__init__.py +4 -2
  122. pixeltable/io/datarows.py +8 -7
  123. pixeltable/io/external_store.py +56 -105
  124. pixeltable/io/fiftyone.py +13 -13
  125. pixeltable/io/globals.py +31 -30
  126. pixeltable/io/hf_datasets.py +61 -16
  127. pixeltable/io/label_studio.py +74 -70
  128. pixeltable/io/lancedb.py +3 -0
  129. pixeltable/io/pandas.py +21 -12
  130. pixeltable/io/parquet.py +25 -105
  131. pixeltable/io/table_data_conduit.py +250 -123
  132. pixeltable/io/utils.py +4 -4
  133. pixeltable/iterators/__init__.py +2 -1
  134. pixeltable/iterators/audio.py +26 -25
  135. pixeltable/iterators/base.py +9 -3
  136. pixeltable/iterators/document.py +112 -78
  137. pixeltable/iterators/image.py +12 -15
  138. pixeltable/iterators/string.py +11 -4
  139. pixeltable/iterators/video.py +523 -120
  140. pixeltable/metadata/__init__.py +14 -3
  141. pixeltable/metadata/converters/convert_13.py +2 -2
  142. pixeltable/metadata/converters/convert_18.py +2 -2
  143. pixeltable/metadata/converters/convert_19.py +2 -2
  144. pixeltable/metadata/converters/convert_20.py +2 -2
  145. pixeltable/metadata/converters/convert_21.py +2 -2
  146. pixeltable/metadata/converters/convert_22.py +2 -2
  147. pixeltable/metadata/converters/convert_24.py +2 -2
  148. pixeltable/metadata/converters/convert_25.py +2 -2
  149. pixeltable/metadata/converters/convert_26.py +2 -2
  150. pixeltable/metadata/converters/convert_29.py +4 -4
  151. pixeltable/metadata/converters/convert_30.py +34 -21
  152. pixeltable/metadata/converters/convert_34.py +2 -2
  153. pixeltable/metadata/converters/convert_35.py +9 -0
  154. pixeltable/metadata/converters/convert_36.py +38 -0
  155. pixeltable/metadata/converters/convert_37.py +15 -0
  156. pixeltable/metadata/converters/convert_38.py +39 -0
  157. pixeltable/metadata/converters/convert_39.py +124 -0
  158. pixeltable/metadata/converters/convert_40.py +73 -0
  159. pixeltable/metadata/converters/convert_41.py +12 -0
  160. pixeltable/metadata/converters/convert_42.py +9 -0
  161. pixeltable/metadata/converters/convert_43.py +44 -0
  162. pixeltable/metadata/converters/util.py +20 -31
  163. pixeltable/metadata/notes.py +9 -0
  164. pixeltable/metadata/schema.py +140 -53
  165. pixeltable/metadata/utils.py +74 -0
  166. pixeltable/mypy/__init__.py +3 -0
  167. pixeltable/mypy/mypy_plugin.py +123 -0
  168. pixeltable/plan.py +382 -115
  169. pixeltable/share/__init__.py +1 -1
  170. pixeltable/share/packager.py +547 -83
  171. pixeltable/share/protocol/__init__.py +33 -0
  172. pixeltable/share/protocol/common.py +165 -0
  173. pixeltable/share/protocol/operation_types.py +33 -0
  174. pixeltable/share/protocol/replica.py +119 -0
  175. pixeltable/share/publish.py +257 -59
  176. pixeltable/store.py +311 -194
  177. pixeltable/type_system.py +373 -211
  178. pixeltable/utils/__init__.py +2 -3
  179. pixeltable/utils/arrow.py +131 -17
  180. pixeltable/utils/av.py +298 -0
  181. pixeltable/utils/azure_store.py +346 -0
  182. pixeltable/utils/coco.py +6 -6
  183. pixeltable/utils/code.py +3 -3
  184. pixeltable/utils/console_output.py +4 -1
  185. pixeltable/utils/coroutine.py +6 -23
  186. pixeltable/utils/dbms.py +32 -6
  187. pixeltable/utils/description_helper.py +4 -5
  188. pixeltable/utils/documents.py +7 -18
  189. pixeltable/utils/exception_handler.py +7 -30
  190. pixeltable/utils/filecache.py +6 -6
  191. pixeltable/utils/formatter.py +86 -48
  192. pixeltable/utils/gcs_store.py +295 -0
  193. pixeltable/utils/http.py +133 -0
  194. pixeltable/utils/http_server.py +2 -3
  195. pixeltable/utils/iceberg.py +1 -2
  196. pixeltable/utils/image.py +17 -0
  197. pixeltable/utils/lancedb.py +90 -0
  198. pixeltable/utils/local_store.py +322 -0
  199. pixeltable/utils/misc.py +5 -0
  200. pixeltable/utils/object_stores.py +573 -0
  201. pixeltable/utils/pydantic.py +60 -0
  202. pixeltable/utils/pytorch.py +5 -6
  203. pixeltable/utils/s3_store.py +527 -0
  204. pixeltable/utils/sql.py +26 -0
  205. pixeltable/utils/system.py +30 -0
  206. pixeltable-0.5.7.dist-info/METADATA +579 -0
  207. pixeltable-0.5.7.dist-info/RECORD +227 -0
  208. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  209. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  210. pixeltable/__version__.py +0 -3
  211. pixeltable/catalog/named_function.py +0 -40
  212. pixeltable/ext/__init__.py +0 -17
  213. pixeltable/ext/functions/__init__.py +0 -11
  214. pixeltable/ext/functions/whisperx.py +0 -77
  215. pixeltable/utils/media_store.py +0 -77
  216. pixeltable/utils/s3.py +0 -17
  217. pixeltable-0.3.14.dist-info/METADATA +0 -434
  218. pixeltable-0.3.14.dist-info/RECORD +0 -186
  219. pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
  220. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
@@ -1,12 +1,13 @@
1
1
  import logging
2
- import uuid
3
2
  from fractions import Fraction
4
3
  from pathlib import Path
5
- from typing import Any, ClassVar, Optional
4
+ from typing import Any, ClassVar
6
5
 
7
6
  import av
7
+ from deprecated import deprecated
8
8
 
9
- from pixeltable import env, exceptions as excs, type_system as ts
9
+ from pixeltable import exceptions as excs, type_system as ts
10
+ from pixeltable.utils.local_store import TempStore
10
11
 
11
12
  from .base import ComponentIterator
12
13
 
@@ -14,18 +15,6 @@ _logger = logging.getLogger('pixeltable')
14
15
 
15
16
 
16
17
  class AudioSplitter(ComponentIterator):
17
- """
18
- Iterator over chunks of an audio file. The audio file is split into smaller chunks,
19
- where the duration of each chunk is determined by chunk_duration_sec.
20
- The iterator yields audio chunks as pxt.Audio, along with the start and end time of each chunk.
21
- If the input contains no audio, no chunks are yielded.
22
-
23
- Args:
24
- chunk_duration_sec: Audio chunk duration in seconds
25
- overlap_sec: Overlap between consecutive chunks in seconds.
26
- min_chunk_duration_sec: Drop the last chunk if it is smaller than min_chunk_duration_sec
27
- """
28
-
29
18
  # Input parameters
30
19
  audio_path: Path
31
20
  chunk_duration_sec: float
@@ -37,7 +26,7 @@ class AudioSplitter(ComponentIterator):
37
26
 
38
27
  # List of chunks to extract
39
28
  # Each chunk is defined by start and end presentation timestamps in audio file (int)
40
- chunks_to_extract_in_pts: Optional[list[tuple[int, int]]]
29
+ chunks_to_extract_in_pts: list[tuple[int, int]] | None
41
30
  # next chunk to extract
42
31
  next_pos: int
43
32
 
@@ -55,12 +44,9 @@ class AudioSplitter(ComponentIterator):
55
44
  def __init__(
56
45
  self, audio: str, chunk_duration_sec: float, *, overlap_sec: float = 0.0, min_chunk_duration_sec: float = 0.0
57
46
  ):
58
- if chunk_duration_sec <= 0.0:
59
- raise excs.Error('chunk_duration_sec must be a positive number')
60
- if chunk_duration_sec < min_chunk_duration_sec:
61
- raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
62
- if overlap_sec >= chunk_duration_sec:
63
- raise excs.Error('overlap_sec must be less than chunk_duration_sec')
47
+ assert chunk_duration_sec > 0.0
48
+ assert chunk_duration_sec >= min_chunk_duration_sec
49
+ assert overlap_sec < chunk_duration_sec
64
50
  audio_path = Path(audio)
65
51
  assert audio_path.exists() and audio_path.is_file()
66
52
  self.audio_path = audio_path
@@ -128,6 +114,19 @@ class AudioSplitter(ComponentIterator):
128
114
 
129
115
  @classmethod
130
116
  def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
117
+ param_names = ['chunk_duration_sec', 'min_chunk_duration_sec', 'overlap_sec']
118
+ params = dict(zip(param_names, args))
119
+ params.update(kwargs)
120
+
121
+ chunk_duration_sec = params['chunk_duration_sec']
122
+ min_chunk_duration_sec = params.get('min_chunk_duration_sec', 0.0)
123
+ overlap_sec = params.get('overlap_sec', 0.0)
124
+ if chunk_duration_sec <= 0.0:
125
+ raise excs.Error('chunk_duration_sec must be a positive number')
126
+ if chunk_duration_sec < min_chunk_duration_sec:
127
+ raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
128
+ if overlap_sec >= chunk_duration_sec:
129
+ raise excs.Error('overlap_sec must be less than chunk_duration_sec')
131
130
  return {
132
131
  'start_time_sec': ts.FloatType(),
133
132
  'end_time_sec': ts.FloatType(),
@@ -140,7 +139,7 @@ class AudioSplitter(ComponentIterator):
140
139
  target_chunk_start, target_chunk_end = self.chunks_to_extract_in_pts[self.next_pos]
141
140
  chunk_start_pts = 0
142
141
  chunk_end_pts = 0
143
- chunk_file = str(env.Env.get().tmp_dir / f'{uuid.uuid4()}{self.audio_path.suffix}')
142
+ chunk_file = str(TempStore.create_path(extension=self.audio_path.suffix))
144
143
  output_container = av.open(chunk_file, mode='w')
145
144
  input_stream = self.container.streams.audio[0]
146
145
  codec_name = AudioSplitter.__codec_map.get(input_stream.codec_context.name, input_stream.codec_context.name)
@@ -202,5 +201,7 @@ class AudioSplitter(ComponentIterator):
202
201
  def close(self) -> None:
203
202
  self.container.close()
204
203
 
205
- def set_pos(self, pos: int) -> None:
206
- pass
204
+ @classmethod
205
+ @deprecated('create() is deprecated; use `pixeltable.functions.audio.audio_splitter` instead', version='0.5.6')
206
+ def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
207
+ return super()._create(**kwargs)
@@ -43,11 +43,17 @@ class ComponentIterator(ABC):
43
43
  """Close the iterator and release all resources"""
44
44
  raise NotImplementedError
45
45
 
46
- @abstractmethod
47
- def set_pos(self, pos: int) -> None:
46
+ def set_pos(self, pos: int, **kwargs: Any) -> None:
48
47
  """Set the iterator position to pos"""
49
- raise NotImplementedError
48
+ pass
50
49
 
51
50
  @classmethod
52
51
  def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
52
+ # TODO: This is still needed for compatibility with existing user-defined iterators; it will become deprecated
53
+ # when the new decorator pattern is introduced for iterators
54
+ return cls._create(**kwargs)
55
+
56
+ @classmethod
57
+ def _create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
58
+ # create() variant that can be called by subclasses without generating a deprecation warning.
53
59
  return cls, kwargs
@@ -1,13 +1,17 @@
1
1
  import dataclasses
2
2
  import enum
3
3
  import logging
4
- from typing import Any, ClassVar, Iterable, Iterator, Optional, Union
4
+ from typing import Any, ClassVar, Iterable, Iterator, Literal
5
5
 
6
6
  import ftfy
7
+ import PIL.Image
8
+ from bs4.element import NavigableString, Tag
9
+ from deprecated import deprecated
10
+ from pypdfium2 import PdfDocument # type: ignore[import-untyped]
7
11
 
8
12
  from pixeltable.env import Env
9
13
  from pixeltable.exceptions import Error
10
- from pixeltable.type_system import ColumnType, DocumentType, IntType, JsonType, StringType
14
+ from pixeltable.type_system import ColumnType, DocumentType, ImageType, IntType, JsonType, StringType
11
15
  from pixeltable.utils.documents import get_document_handle
12
16
 
13
17
  from .base import ComponentIterator
@@ -15,6 +19,11 @@ from .base import ComponentIterator
15
19
  _logger = logging.getLogger('pixeltable')
16
20
 
17
21
 
22
+ class Element(enum.Enum):
23
+ TEXT = 1
24
+ IMAGE = 2
25
+
26
+
18
27
  class ChunkMetadata(enum.Enum):
19
28
  TITLE = 1
20
29
  HEADING = 2
@@ -37,27 +46,28 @@ class DocumentSectionMetadata:
37
46
  """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
38
47
 
39
48
  # html and markdown metadata
40
- sourceline: Optional[int] = None
49
+ sourceline: int | None = None
41
50
  # the stack of headings up to the most recently observed one;
42
51
  # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
43
- heading: Optional[dict[str, str]] = None
52
+ heading: dict[str, str] | None = None
44
53
 
45
54
  # pdf-specific metadata
46
- page: Optional[int] = None
55
+ page: int | None = None
47
56
  # bounding box as an {x1, y1, x2, y2} dictionary
48
- bounding_box: Optional[dict[str, float]] = None
57
+ bounding_box: dict[str, float] | None = None
49
58
 
50
59
 
51
60
  @dataclasses.dataclass
52
61
  class DocumentSection:
53
62
  """A single document chunk, according to some of the splitting criteria"""
54
63
 
55
- text: Optional[str]
56
- metadata: Optional[DocumentSectionMetadata]
64
+ text: str | None = None
65
+ image: PIL.Image.Image | None = None
66
+ metadata: DocumentSectionMetadata | None = None
57
67
 
58
68
 
59
69
  def _parse_separators(separators: str) -> list[Separator]:
60
- ret = []
70
+ ret: list[Separator] = []
61
71
  for s in separators.split(','):
62
72
  clean_s = s.strip().upper()
63
73
  if not clean_s:
@@ -71,7 +81,7 @@ def _parse_separators(separators: str) -> list[Separator]:
71
81
 
72
82
 
73
83
  def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
74
- ret = []
84
+ ret: list[ChunkMetadata] = []
75
85
  for m in metadata.split(','):
76
86
  clean_m = m.strip().upper()
77
87
  if not clean_m:
@@ -84,18 +94,22 @@ def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
84
94
  return ret
85
95
 
86
96
 
87
- _HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
88
-
97
+ def _parse_elements(elements: list[Literal['text', 'image']]) -> list[Element]:
98
+ result: list[Element] = []
99
+ for e in elements:
100
+ clean_e = e.strip().upper()
101
+ if clean_e not in Element.__members__:
102
+ raise Error(f'Invalid element: `{e}`. Valid elements are: {", ".join(Element.__members__).lower()}')
103
+ result.append(Element[clean_e])
104
+ if len(result) == 0:
105
+ raise Error('elements cannot be empty')
106
+ return result
89
107
 
90
- class DocumentSplitter(ComponentIterator):
91
- """Iterator over chunks of a document. The document is chunked according to the specified `separators`.
92
108
 
93
- The iterator yields a `text` field containing the text of the chunk, and it may also
94
- include additional metadata fields if specified in the `metadata` parameter, as explained below.
109
+ _HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
95
110
 
96
- Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
97
- """
98
111
 
112
+ class DocumentSplitter(ComponentIterator):
99
113
  METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {
100
114
  ChunkMetadata.TITLE: StringType(nullable=True),
101
115
  ChunkMetadata.HEADING: JsonType(nullable=True),
@@ -104,36 +118,41 @@ class DocumentSplitter(ComponentIterator):
104
118
  ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
105
119
  }
106
120
 
121
+ _doc_handle: Any
122
+ _separators: list[Separator]
123
+ _elements: list[Element]
124
+ _metadata_fields: list[ChunkMetadata]
125
+ _doc_title: str
126
+ _limit: int
127
+ _skip_tags: list[str]
128
+ _overlap: int
129
+ _tiktoken_encoding: str | None
130
+ _tiktoken_target_model: str | None
131
+ _image_dpi: int
132
+ _image_format: str
133
+
134
+ _sections: Iterator[DocumentSection]
135
+
107
136
  def __init__(
108
137
  self,
109
138
  document: str,
110
139
  *,
111
140
  separators: str,
112
- limit: Optional[int] = None,
113
- overlap: Optional[int] = None,
141
+ elements: list[Literal['text', 'image']] | None = None,
142
+ limit: int | None = None,
143
+ overlap: int | None = None,
114
144
  metadata: str = '',
115
- html_skip_tags: Optional[list[str]] = None,
116
- tiktoken_encoding: Optional[str] = 'cl100k_base',
117
- tiktoken_target_model: Optional[str] = None,
145
+ skip_tags: list[str] | None = None,
146
+ tiktoken_encoding: str | None = 'cl100k_base',
147
+ tiktoken_target_model: str | None = None,
148
+ image_dpi: int = 300,
149
+ image_format: str = 'png',
118
150
  ):
119
- """Init method for `DocumentSplitter` class.
120
-
121
- Args:
122
- separators: separators to use to chunk the document. Options are:
123
- `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
124
- This may be a comma-separated string, e.g., `'heading,token_limit'`.
125
- limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
126
- or `'char_limit'` is specified.
127
- metadata: additional metadata fields to include in the output. Options are:
128
- `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
129
- (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
130
- """
131
- if html_skip_tags is None:
132
- html_skip_tags = ['nav']
151
+ if skip_tags is None:
152
+ skip_tags = ['nav']
133
153
  self._doc_handle = get_document_handle(document)
154
+ self._elements = _parse_elements(elements.copy()) if elements is not None else [Element.TEXT]
134
155
  assert self._doc_handle is not None
135
- # calling the output_schema method to validate the input arguments
136
- self.output_schema(separators=separators, metadata=metadata, limit=limit, overlap=overlap)
137
156
  self._separators = _parse_separators(separators)
138
157
  self._metadata_fields = _parse_metadata(metadata)
139
158
  if self._doc_handle.bs_doc is not None:
@@ -145,10 +164,12 @@ class DocumentSplitter(ComponentIterator):
145
164
  else:
146
165
  self._doc_title = ''
147
166
  self._limit = 0 if limit is None else limit
148
- self._skip_tags = html_skip_tags
167
+ self._skip_tags = skip_tags
149
168
  self._overlap = 0 if overlap is None else overlap
150
169
  self._tiktoken_encoding = tiktoken_encoding
151
170
  self._tiktoken_target_model = tiktoken_target_model
171
+ self._image_dpi = image_dpi
172
+ self._image_format = image_format
152
173
 
153
174
  # set up processing pipeline
154
175
  if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
@@ -178,19 +199,28 @@ class DocumentSplitter(ComponentIterator):
178
199
  return {
179
200
  'document': DocumentType(nullable=False),
180
201
  'separators': StringType(nullable=False),
202
+ 'elements': JsonType(nullable=False),
181
203
  'metadata': StringType(nullable=False),
182
204
  'limit': IntType(nullable=True),
183
205
  'overlap': IntType(nullable=True),
184
206
  'skip_tags': StringType(nullable=True),
185
207
  'tiktoken_encoding': StringType(nullable=True),
186
208
  'tiktoken_target_model': StringType(nullable=True),
209
+ 'image_dpi': IntType(nullable=True),
210
+ 'image_format': StringType(nullable=True),
187
211
  }
188
212
 
189
213
  @classmethod
190
214
  def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
191
- schema: dict[str, ColumnType] = {'text': StringType()}
192
- md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
193
-
215
+ schema: dict[str, ColumnType] = {}
216
+ elements = _parse_elements(kwargs.get('elements', ['text']))
217
+ for element in elements:
218
+ if element == Element.TEXT:
219
+ schema['text'] = StringType(nullable=False)
220
+ elif element == Element.IMAGE:
221
+ schema['image'] = ImageType(nullable=False)
222
+
223
+ md_fields = _parse_metadata(kwargs.get('metadata', ''))
194
224
  for md_field in md_fields:
195
225
  schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]
196
226
 
@@ -200,6 +230,8 @@ class DocumentSplitter(ComponentIterator):
200
230
  limit = kwargs.get('limit')
201
231
  overlap = kwargs.get('overlap')
202
232
 
233
+ if Element.IMAGE in elements and separators != [Separator.PAGE]:
234
+ raise Error('Image elements are only supported for the "page" separator on PDF documents')
203
235
  if limit is not None or overlap is not None:
204
236
  if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
205
237
  raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
@@ -213,7 +245,6 @@ class DocumentSplitter(ComponentIterator):
213
245
  if kwargs.get('limit') is None:
214
246
  raise Error('limit is required with "token_limit"/"char_limit" separators')
215
247
 
216
- # check dependencies at the end
217
248
  if Separator.SENTENCE in separators:
218
249
  _ = Env.get().spacy_nlp
219
250
  if Separator.TOKEN_LIMIT in separators:
@@ -224,9 +255,15 @@ class DocumentSplitter(ComponentIterator):
224
255
  def __next__(self) -> dict[str, Any]:
225
256
  while True:
226
257
  section = next(self._sections)
227
- if section.text is None:
258
+ if section.text is None and section.image is None:
228
259
  continue
229
- result: dict[str, Any] = {'text': section.text}
260
+ result: dict[str, Any] = {}
261
+ for element in self._elements:
262
+ if element == Element.TEXT:
263
+ result['text'] = section.text
264
+ elif element == Element.IMAGE:
265
+ result['image'] = section.image
266
+
230
267
  for md_field in self._metadata_fields:
231
268
  if md_field == ChunkMetadata.TITLE:
232
269
  result[md_field.name.lower()] = self._doc_title
@@ -238,6 +275,7 @@ class DocumentSplitter(ComponentIterator):
238
275
  result[md_field.name.lower()] = section.metadata.page
239
276
  elif md_field == ChunkMetadata.BOUNDING_BOX:
240
277
  result[md_field.name.lower()] = section.metadata.bounding_box
278
+
241
279
  return result
242
280
 
243
281
  def _html_sections(self) -> Iterator[DocumentSection]:
@@ -273,7 +311,7 @@ class DocumentSplitter(ComponentIterator):
273
311
  yield DocumentSection(text=full_text, metadata=md)
274
312
  accumulated_text = []
275
313
 
276
- def process_element(el: Union[bs4.element.Tag, bs4.NavigableString]) -> Iterator[DocumentSection]:
314
+ def process_element(el: Tag | NavigableString) -> Iterator[DocumentSection]:
277
315
  # process the element and emit sections as necessary
278
316
  nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
279
317
 
@@ -361,43 +399,35 @@ class DocumentSplitter(ComponentIterator):
361
399
  yield from emit()
362
400
 
363
401
  def _pdf_sections(self) -> Iterator[DocumentSection]:
364
- """Create DocumentSections reflecting the pdf-specific separators"""
365
- import fitz # type: ignore[import-untyped]
366
-
367
- doc: fitz.Document = self._doc_handle.pdf_doc
368
- assert doc is not None
402
+ if Separator.PARAGRAPH in self._separators:
403
+ raise Error(
404
+ 'Paragraph splitting is not currently supported for PDF documents. Please contact'
405
+ ' us at https://github.com/pixeltable/pixeltable/issues if you need this feature.'
406
+ )
369
407
 
370
- emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
371
- emit_on_page = Separator.PAGE in self._separators or emit_on_paragraph
408
+ doc: PdfDocument = self._doc_handle.pdf_doc
409
+ assert isinstance(doc, PdfDocument)
372
410
 
373
- accumulated_text = [] # invariant: all elements are ftfy clean and non-empty
411
+ emit_on_page = Separator.PAGE in self._separators
412
+ accumulated_text: list[str] = []
374
413
 
375
- def _add_cleaned_text(raw_text: str) -> None:
376
- fixed = ftfy.fix_text(raw_text)
414
+ def _add_cleaned(raw: str) -> None:
415
+ fixed = ftfy.fix_text(raw)
377
416
  if fixed:
378
417
  accumulated_text.append(fixed)
379
418
 
380
419
  def _emit_text() -> str:
381
- full_text = ''.join(accumulated_text)
420
+ txt = ''.join(accumulated_text)
382
421
  accumulated_text.clear()
383
- return full_text
384
-
385
- for page_number, page in enumerate(doc.pages()):
386
- for block in page.get_text('blocks'):
387
- # there is no concept of paragraph in pdf, block is the closest thing
388
- # we can get (eg a paragraph in text may cut across pages)
389
- # see pymupdf docs https://pymupdf.readthedocs.io/en/latest/app1.html
390
- # other libraries like pdfminer also lack an explicit paragraph concept
391
- x1, y1, x2, y2, text, _, _ = block
392
- _add_cleaned_text(text)
393
- if accumulated_text and emit_on_paragraph:
394
- bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
395
- metadata = DocumentSectionMetadata(page=page_number, bounding_box=bbox)
396
- yield DocumentSection(text=_emit_text(), metadata=metadata)
397
-
398
- if accumulated_text and emit_on_page and not emit_on_paragraph:
399
- yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata(page=page_number))
400
- accumulated_text = []
422
+ return txt
423
+
424
+ for page_idx, page in enumerate(doc):
425
+ img = page.render().to_pil() if Element.IMAGE in self._elements else None
426
+ text = page.get_textpage().get_text_bounded()
427
+ _add_cleaned(text)
428
+ if accumulated_text and emit_on_page:
429
+ md = DocumentSectionMetadata(page=page_idx)
430
+ yield DocumentSection(text=_emit_text(), image=img, metadata=md)
401
431
 
402
432
  if accumulated_text and not emit_on_page:
403
433
  yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
@@ -465,5 +495,9 @@ class DocumentSplitter(ComponentIterator):
465
495
  def close(self) -> None:
466
496
  pass
467
497
 
468
- def set_pos(self, pos: int) -> None:
469
- pass
498
+ @classmethod
499
+ @deprecated(
500
+ 'create() is deprecated; use `pixeltable.functions.document.document_splitter` instead', version='0.5.6'
501
+ )
502
+ def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
503
+ return super()._create(**kwargs)
@@ -1,6 +1,7 @@
1
1
  from typing import Any, Sequence
2
2
 
3
3
  import PIL.Image
4
+ from deprecated import deprecated
4
5
 
5
6
  import pixeltable.exceptions as excs
6
7
  import pixeltable.type_system as ts
@@ -8,18 +9,6 @@ from pixeltable.iterators.base import ComponentIterator
8
9
 
9
10
 
10
11
  class TileIterator(ComponentIterator):
11
- """
12
- Iterator over tiles of an image. Each image will be divided into tiles of size `tile_size`, and the tiles will be
13
- iterated over in row-major order (left-to-right, then top-to-bottom). An optional `overlap` parameter may be
14
- specified. If the tiles do not exactly cover the image, then the rightmost and bottommost tiles will be padded with
15
- blackspace, so that the output images all have the exact size `tile_size`.
16
-
17
- Args:
18
- image: Image to split into tiles.
19
- tile_size: Size of each tile, as a pair of integers `[width, height]`.
20
- overlap: Amount of overlap between adjacent tiles, as a pair of integers `[width, height]`.
21
- """
22
-
23
12
  __image: PIL.Image.Image
24
13
  __tile_size: Sequence[int]
25
14
  __overlap: Sequence[int]
@@ -31,8 +20,7 @@ class TileIterator(ComponentIterator):
31
20
  __j: int
32
21
 
33
22
  def __init__(self, image: PIL.Image.Image, *, tile_size: tuple[int, int], overlap: tuple[int, int] = (0, 0)):
34
- if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
35
- raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
23
+ assert overlap[0] < tile_size[0] and overlap[1] < tile_size[1]
36
24
 
37
25
  self.__image = image
38
26
  self.__image.load()
@@ -69,7 +57,7 @@ class TileIterator(ComponentIterator):
69
57
  def close(self) -> None:
70
58
  pass
71
59
 
72
- def set_pos(self, pos: int) -> None:
60
+ def set_pos(self, pos: int, **kwargs: Any) -> None:
73
61
  self.__j = pos // self.__xlen
74
62
  self.__i = pos % self.__xlen
75
63
 
@@ -79,4 +67,13 @@ class TileIterator(ComponentIterator):
79
67
 
80
68
  @classmethod
81
69
  def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
70
+ tile_size = kwargs.get('tile_size')
71
+ overlap = kwargs.get('overlap', (0, 0))
72
+ if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
73
+ raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
82
74
  return {'tile': ts.ImageType(), 'tile_coord': ts.JsonType(), 'tile_box': ts.JsonType()}, ['tile']
75
+
76
+ @classmethod
77
+ @deprecated('create() is deprecated; use `pixeltable.functions.image.tile_iterator` instead', version='0.5.6')
78
+ def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
79
+ return super()._create(**kwargs)
@@ -1,12 +1,17 @@
1
1
  from typing import Any, Iterator
2
2
 
3
+ from deprecated import deprecated
4
+
3
5
  from pixeltable import exceptions as excs, type_system as ts
4
6
  from pixeltable.env import Env
5
7
  from pixeltable.iterators.base import ComponentIterator
6
8
 
7
9
 
8
10
  class StringSplitter(ComponentIterator):
9
- # TODO(aaron-siegel): Merge this with `DocumentSplitter` in order to provide additional capabilities.
11
+ _text: str
12
+ doc: Any # spacy doc
13
+ iter: Iterator[dict[str, Any]]
14
+
10
15
  def __init__(self, text: str, *, separators: str):
11
16
  if separators != 'sentence':
12
17
  raise excs.Error('Only `sentence` separators are currently supported.')
@@ -24,9 +29,6 @@ class StringSplitter(ComponentIterator):
24
29
  def close(self) -> None:
25
30
  pass
26
31
 
27
- def set_pos(self, pos: int) -> None:
28
- pass
29
-
30
32
  @classmethod
31
33
  def input_schema(cls, *args: Any, **kwargs: Any) -> dict[str, ts.ColumnType]:
32
34
  return {'text': ts.StringType(), 'separators': ts.StringType()}
@@ -34,3 +36,8 @@ class StringSplitter(ComponentIterator):
34
36
  @classmethod
35
37
  def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
36
38
  return {'text': ts.StringType()}, []
39
+
40
+ @classmethod
41
+ @deprecated('create() is deprecated; use `pixeltable.functions.string.string_splitter` instead', version='0.5.6')
42
+ def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
43
+ return super()._create(**kwargs)