pixeltable-0.2.26-py3-none-any.whl → pixeltable-0.5.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. pixeltable/__init__.py +83 -19
  2. pixeltable/_query.py +1444 -0
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +7 -4
  5. pixeltable/catalog/catalog.py +2394 -119
  6. pixeltable/catalog/column.py +225 -104
  7. pixeltable/catalog/dir.py +38 -9
  8. pixeltable/catalog/globals.py +53 -34
  9. pixeltable/catalog/insertable_table.py +265 -115
  10. pixeltable/catalog/path.py +80 -17
  11. pixeltable/catalog/schema_object.py +28 -43
  12. pixeltable/catalog/table.py +1270 -677
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +1270 -751
  15. pixeltable/catalog/table_version_handle.py +109 -0
  16. pixeltable/catalog/table_version_path.py +137 -42
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +251 -134
  20. pixeltable/config.py +215 -0
  21. pixeltable/env.py +736 -285
  22. pixeltable/exceptions.py +26 -2
  23. pixeltable/exec/__init__.py +7 -2
  24. pixeltable/exec/aggregation_node.py +39 -21
  25. pixeltable/exec/cache_prefetch_node.py +87 -109
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +25 -28
  29. pixeltable/exec/data_row_batch.py +11 -46
  30. pixeltable/exec/exec_context.py +26 -11
  31. pixeltable/exec/exec_node.py +35 -27
  32. pixeltable/exec/expr_eval/__init__.py +3 -0
  33. pixeltable/exec/expr_eval/evaluators.py +365 -0
  34. pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
  35. pixeltable/exec/expr_eval/globals.py +200 -0
  36. pixeltable/exec/expr_eval/row_buffer.py +74 -0
  37. pixeltable/exec/expr_eval/schedulers.py +413 -0
  38. pixeltable/exec/globals.py +35 -0
  39. pixeltable/exec/in_memory_data_node.py +35 -27
  40. pixeltable/exec/object_store_save_node.py +293 -0
  41. pixeltable/exec/row_update_node.py +44 -29
  42. pixeltable/exec/sql_node.py +414 -115
  43. pixeltable/exprs/__init__.py +8 -5
  44. pixeltable/exprs/arithmetic_expr.py +79 -45
  45. pixeltable/exprs/array_slice.py +5 -5
  46. pixeltable/exprs/column_property_ref.py +40 -26
  47. pixeltable/exprs/column_ref.py +254 -61
  48. pixeltable/exprs/comparison.py +14 -9
  49. pixeltable/exprs/compound_predicate.py +9 -10
  50. pixeltable/exprs/data_row.py +213 -72
  51. pixeltable/exprs/expr.py +270 -104
  52. pixeltable/exprs/expr_dict.py +6 -5
  53. pixeltable/exprs/expr_set.py +20 -11
  54. pixeltable/exprs/function_call.py +383 -284
  55. pixeltable/exprs/globals.py +18 -5
  56. pixeltable/exprs/in_predicate.py +7 -7
  57. pixeltable/exprs/inline_expr.py +37 -37
  58. pixeltable/exprs/is_null.py +8 -4
  59. pixeltable/exprs/json_mapper.py +120 -54
  60. pixeltable/exprs/json_path.py +90 -60
  61. pixeltable/exprs/literal.py +61 -16
  62. pixeltable/exprs/method_ref.py +7 -6
  63. pixeltable/exprs/object_ref.py +19 -8
  64. pixeltable/exprs/row_builder.py +238 -75
  65. pixeltable/exprs/rowid_ref.py +53 -15
  66. pixeltable/exprs/similarity_expr.py +65 -50
  67. pixeltable/exprs/sql_element_cache.py +5 -5
  68. pixeltable/exprs/string_op.py +107 -0
  69. pixeltable/exprs/type_cast.py +25 -13
  70. pixeltable/exprs/variable.py +2 -2
  71. pixeltable/func/__init__.py +9 -5
  72. pixeltable/func/aggregate_function.py +197 -92
  73. pixeltable/func/callable_function.py +119 -35
  74. pixeltable/func/expr_template_function.py +101 -48
  75. pixeltable/func/function.py +375 -62
  76. pixeltable/func/function_registry.py +20 -19
  77. pixeltable/func/globals.py +6 -5
  78. pixeltable/func/mcp.py +74 -0
  79. pixeltable/func/query_template_function.py +151 -35
  80. pixeltable/func/signature.py +178 -49
  81. pixeltable/func/tools.py +164 -0
  82. pixeltable/func/udf.py +176 -53
  83. pixeltable/functions/__init__.py +44 -4
  84. pixeltable/functions/anthropic.py +226 -47
  85. pixeltable/functions/audio.py +148 -11
  86. pixeltable/functions/bedrock.py +137 -0
  87. pixeltable/functions/date.py +188 -0
  88. pixeltable/functions/deepseek.py +113 -0
  89. pixeltable/functions/document.py +81 -0
  90. pixeltable/functions/fal.py +76 -0
  91. pixeltable/functions/fireworks.py +72 -20
  92. pixeltable/functions/gemini.py +249 -0
  93. pixeltable/functions/globals.py +208 -53
  94. pixeltable/functions/groq.py +108 -0
  95. pixeltable/functions/huggingface.py +1088 -95
  96. pixeltable/functions/image.py +155 -84
  97. pixeltable/functions/json.py +8 -11
  98. pixeltable/functions/llama_cpp.py +31 -19
  99. pixeltable/functions/math.py +169 -0
  100. pixeltable/functions/mistralai.py +50 -75
  101. pixeltable/functions/net.py +70 -0
  102. pixeltable/functions/ollama.py +29 -36
  103. pixeltable/functions/openai.py +548 -160
  104. pixeltable/functions/openrouter.py +143 -0
  105. pixeltable/functions/replicate.py +15 -14
  106. pixeltable/functions/reve.py +250 -0
  107. pixeltable/functions/string.py +310 -85
  108. pixeltable/functions/timestamp.py +37 -19
  109. pixeltable/functions/together.py +77 -120
  110. pixeltable/functions/twelvelabs.py +188 -0
  111. pixeltable/functions/util.py +7 -2
  112. pixeltable/functions/uuid.py +30 -0
  113. pixeltable/functions/video.py +1528 -117
  114. pixeltable/functions/vision.py +26 -26
  115. pixeltable/functions/voyageai.py +289 -0
  116. pixeltable/functions/whisper.py +19 -10
  117. pixeltable/functions/whisperx.py +179 -0
  118. pixeltable/functions/yolox.py +112 -0
  119. pixeltable/globals.py +716 -236
  120. pixeltable/index/__init__.py +3 -1
  121. pixeltable/index/base.py +17 -21
  122. pixeltable/index/btree.py +32 -22
  123. pixeltable/index/embedding_index.py +155 -92
  124. pixeltable/io/__init__.py +12 -7
  125. pixeltable/io/datarows.py +140 -0
  126. pixeltable/io/external_store.py +83 -125
  127. pixeltable/io/fiftyone.py +24 -33
  128. pixeltable/io/globals.py +47 -182
  129. pixeltable/io/hf_datasets.py +96 -127
  130. pixeltable/io/label_studio.py +171 -156
  131. pixeltable/io/lancedb.py +3 -0
  132. pixeltable/io/pandas.py +136 -115
  133. pixeltable/io/parquet.py +40 -153
  134. pixeltable/io/table_data_conduit.py +702 -0
  135. pixeltable/io/utils.py +100 -0
  136. pixeltable/iterators/__init__.py +8 -4
  137. pixeltable/iterators/audio.py +207 -0
  138. pixeltable/iterators/base.py +9 -3
  139. pixeltable/iterators/document.py +144 -87
  140. pixeltable/iterators/image.py +17 -38
  141. pixeltable/iterators/string.py +15 -12
  142. pixeltable/iterators/video.py +523 -127
  143. pixeltable/metadata/__init__.py +33 -8
  144. pixeltable/metadata/converters/convert_10.py +2 -3
  145. pixeltable/metadata/converters/convert_13.py +2 -2
  146. pixeltable/metadata/converters/convert_15.py +15 -11
  147. pixeltable/metadata/converters/convert_16.py +4 -5
  148. pixeltable/metadata/converters/convert_17.py +4 -5
  149. pixeltable/metadata/converters/convert_18.py +4 -6
  150. pixeltable/metadata/converters/convert_19.py +6 -9
  151. pixeltable/metadata/converters/convert_20.py +3 -6
  152. pixeltable/metadata/converters/convert_21.py +6 -8
  153. pixeltable/metadata/converters/convert_22.py +3 -2
  154. pixeltable/metadata/converters/convert_23.py +33 -0
  155. pixeltable/metadata/converters/convert_24.py +55 -0
  156. pixeltable/metadata/converters/convert_25.py +19 -0
  157. pixeltable/metadata/converters/convert_26.py +23 -0
  158. pixeltable/metadata/converters/convert_27.py +29 -0
  159. pixeltable/metadata/converters/convert_28.py +13 -0
  160. pixeltable/metadata/converters/convert_29.py +110 -0
  161. pixeltable/metadata/converters/convert_30.py +63 -0
  162. pixeltable/metadata/converters/convert_31.py +11 -0
  163. pixeltable/metadata/converters/convert_32.py +15 -0
  164. pixeltable/metadata/converters/convert_33.py +17 -0
  165. pixeltable/metadata/converters/convert_34.py +21 -0
  166. pixeltable/metadata/converters/convert_35.py +9 -0
  167. pixeltable/metadata/converters/convert_36.py +38 -0
  168. pixeltable/metadata/converters/convert_37.py +15 -0
  169. pixeltable/metadata/converters/convert_38.py +39 -0
  170. pixeltable/metadata/converters/convert_39.py +124 -0
  171. pixeltable/metadata/converters/convert_40.py +73 -0
  172. pixeltable/metadata/converters/convert_41.py +12 -0
  173. pixeltable/metadata/converters/convert_42.py +9 -0
  174. pixeltable/metadata/converters/convert_43.py +44 -0
  175. pixeltable/metadata/converters/util.py +44 -18
  176. pixeltable/metadata/notes.py +21 -0
  177. pixeltable/metadata/schema.py +185 -42
  178. pixeltable/metadata/utils.py +74 -0
  179. pixeltable/mypy/__init__.py +3 -0
  180. pixeltable/mypy/mypy_plugin.py +123 -0
  181. pixeltable/plan.py +616 -225
  182. pixeltable/share/__init__.py +3 -0
  183. pixeltable/share/packager.py +797 -0
  184. pixeltable/share/protocol/__init__.py +33 -0
  185. pixeltable/share/protocol/common.py +165 -0
  186. pixeltable/share/protocol/operation_types.py +33 -0
  187. pixeltable/share/protocol/replica.py +119 -0
  188. pixeltable/share/publish.py +349 -0
  189. pixeltable/store.py +398 -232
  190. pixeltable/type_system.py +730 -267
  191. pixeltable/utils/__init__.py +40 -0
  192. pixeltable/utils/arrow.py +201 -29
  193. pixeltable/utils/av.py +298 -0
  194. pixeltable/utils/azure_store.py +346 -0
  195. pixeltable/utils/coco.py +26 -27
  196. pixeltable/utils/code.py +4 -4
  197. pixeltable/utils/console_output.py +46 -0
  198. pixeltable/utils/coroutine.py +24 -0
  199. pixeltable/utils/dbms.py +92 -0
  200. pixeltable/utils/description_helper.py +11 -12
  201. pixeltable/utils/documents.py +60 -61
  202. pixeltable/utils/exception_handler.py +36 -0
  203. pixeltable/utils/filecache.py +38 -22
  204. pixeltable/utils/formatter.py +88 -51
  205. pixeltable/utils/gcs_store.py +295 -0
  206. pixeltable/utils/http.py +133 -0
  207. pixeltable/utils/http_server.py +14 -13
  208. pixeltable/utils/iceberg.py +13 -0
  209. pixeltable/utils/image.py +17 -0
  210. pixeltable/utils/lancedb.py +90 -0
  211. pixeltable/utils/local_store.py +322 -0
  212. pixeltable/utils/misc.py +5 -0
  213. pixeltable/utils/object_stores.py +573 -0
  214. pixeltable/utils/pydantic.py +60 -0
  215. pixeltable/utils/pytorch.py +20 -20
  216. pixeltable/utils/s3_store.py +527 -0
  217. pixeltable/utils/sql.py +32 -5
  218. pixeltable/utils/system.py +30 -0
  219. pixeltable/utils/transactional_directory.py +4 -3
  220. pixeltable-0.5.7.dist-info/METADATA +579 -0
  221. pixeltable-0.5.7.dist-info/RECORD +227 -0
  222. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  223. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  224. pixeltable/__version__.py +0 -3
  225. pixeltable/catalog/named_function.py +0 -36
  226. pixeltable/catalog/path_dict.py +0 -141
  227. pixeltable/dataframe.py +0 -894
  228. pixeltable/exec/expr_eval_node.py +0 -232
  229. pixeltable/ext/__init__.py +0 -14
  230. pixeltable/ext/functions/__init__.py +0 -8
  231. pixeltable/ext/functions/whisperx.py +0 -77
  232. pixeltable/ext/functions/yolox.py +0 -157
  233. pixeltable/tool/create_test_db_dump.py +0 -311
  234. pixeltable/tool/create_test_video.py +0 -81
  235. pixeltable/tool/doc_plugins/griffe.py +0 -50
  236. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  237. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  238. pixeltable/tool/embed_udf.py +0 -9
  239. pixeltable/tool/mypy_plugin.py +0 -55
  240. pixeltable/utils/media_store.py +0 -76
  241. pixeltable/utils/s3.py +0 -16
  242. pixeltable-0.2.26.dist-info/METADATA +0 -400
  243. pixeltable-0.2.26.dist-info/RECORD +0 -156
  244. pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
  245. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
@@ -1,13 +1,17 @@
1
1
  import dataclasses
2
2
  import enum
3
3
  import logging
4
- from typing import Any, Iterable, Iterator, Optional, Union
4
+ from typing import Any, ClassVar, Iterable, Iterator, Literal
5
5
 
6
6
  import ftfy
7
+ import PIL.Image
8
+ from bs4.element import NavigableString, Tag
9
+ from deprecated import deprecated
10
+ from pypdfium2 import PdfDocument # type: ignore[import-untyped]
7
11
 
8
12
  from pixeltable.env import Env
9
13
  from pixeltable.exceptions import Error
10
- from pixeltable.type_system import ColumnType, DocumentType, IntType, JsonType, StringType
14
+ from pixeltable.type_system import ColumnType, DocumentType, ImageType, IntType, JsonType, StringType
11
15
  from pixeltable.utils.documents import get_document_handle
12
16
 
13
17
  from .base import ComponentIterator
@@ -15,6 +19,11 @@ from .base import ComponentIterator
15
19
  _logger = logging.getLogger('pixeltable')
16
20
 
17
21
 
22
+ class Element(enum.Enum):
23
+ TEXT = 1
24
+ IMAGE = 2
25
+
26
+
18
27
  class ChunkMetadata(enum.Enum):
19
28
  TITLE = 1
20
29
  HEADING = 2
@@ -35,27 +44,30 @@ class Separator(enum.Enum):
35
44
  @dataclasses.dataclass
36
45
  class DocumentSectionMetadata:
37
46
  """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
47
+
38
48
  # html and markdown metadata
39
- sourceline: Optional[int] = None
49
+ sourceline: int | None = None
40
50
  # the stack of headings up to the most recently observed one;
41
51
  # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
42
- heading: Optional[dict[str, str]] = None
52
+ heading: dict[str, str] | None = None
43
53
 
44
54
  # pdf-specific metadata
45
- page: Optional[int] = None
55
+ page: int | None = None
46
56
  # bounding box as an {x1, y1, x2, y2} dictionary
47
- bounding_box: Optional[dict[str, float]] = None
57
+ bounding_box: dict[str, float] | None = None
48
58
 
49
59
 
50
60
  @dataclasses.dataclass
51
61
  class DocumentSection:
52
62
  """A single document chunk, according to some of the splitting criteria"""
53
- text: Optional[str]
54
- metadata: Optional[DocumentSectionMetadata]
63
+
64
+ text: str | None = None
65
+ image: PIL.Image.Image | None = None
66
+ metadata: DocumentSectionMetadata | None = None
55
67
 
56
68
 
57
69
  def _parse_separators(separators: str) -> list[Separator]:
58
- ret = []
70
+ ret: list[Separator] = []
59
71
  for s in separators.split(','):
60
72
  clean_s = s.strip().upper()
61
73
  if not clean_s:
@@ -69,7 +81,7 @@ def _parse_separators(separators: str) -> list[Separator]:
69
81
 
70
82
 
71
83
  def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
72
- ret = []
84
+ ret: list[ChunkMetadata] = []
73
85
  for m in metadata.split(','):
74
86
  clean_m = m.strip().upper()
75
87
  if not clean_m:
@@ -82,18 +94,23 @@ def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
82
94
  return ret
83
95
 
84
96
 
85
- _HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
97
+ def _parse_elements(elements: list[Literal['text', 'image']]) -> list[Element]:
98
+ result: list[Element] = []
99
+ for e in elements:
100
+ clean_e = e.strip().upper()
101
+ if clean_e not in Element.__members__:
102
+ raise Error(f'Invalid element: `{e}`. Valid elements are: {", ".join(Element.__members__).lower()}')
103
+ result.append(Element[clean_e])
104
+ if len(result) == 0:
105
+ raise Error('elements cannot be empty')
106
+ return result
86
107
 
87
108
 
88
- class DocumentSplitter(ComponentIterator):
89
- """Iterator over chunks of a document. The document is chunked according to the specified `separators`.
109
+ _HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
90
110
 
91
- The iterator yields a `text` field containing the text of the chunk, and it may also
92
- include additional metadata fields if specified in the `metadata` parameter, as explained below.
93
111
 
94
- Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
95
- """
96
- METADATA_COLUMN_TYPES = {
112
+ class DocumentSplitter(ComponentIterator):
113
+ METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {
97
114
  ChunkMetadata.TITLE: StringType(nullable=True),
98
115
  ChunkMetadata.HEADING: JsonType(nullable=True),
99
116
  ChunkMetadata.SOURCELINE: IntType(nullable=True),
@@ -101,30 +118,41 @@ class DocumentSplitter(ComponentIterator):
101
118
  ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
102
119
  }
103
120
 
121
+ _doc_handle: Any
122
+ _separators: list[Separator]
123
+ _elements: list[Element]
124
+ _metadata_fields: list[ChunkMetadata]
125
+ _doc_title: str
126
+ _limit: int
127
+ _skip_tags: list[str]
128
+ _overlap: int
129
+ _tiktoken_encoding: str | None
130
+ _tiktoken_target_model: str | None
131
+ _image_dpi: int
132
+ _image_format: str
133
+
134
+ _sections: Iterator[DocumentSection]
135
+
104
136
  def __init__(
105
- self, document: str, *, separators: str, limit: Optional[int] = None, overlap: Optional[int] = None,
106
- metadata: str = '',
107
- html_skip_tags: Optional[list[str]] = None, tiktoken_encoding: Optional[str] = 'cl100k_base',
108
- tiktoken_target_model: Optional[str] = None
137
+ self,
138
+ document: str,
139
+ *,
140
+ separators: str,
141
+ elements: list[Literal['text', 'image']] | None = None,
142
+ limit: int | None = None,
143
+ overlap: int | None = None,
144
+ metadata: str = '',
145
+ skip_tags: list[str] | None = None,
146
+ tiktoken_encoding: str | None = 'cl100k_base',
147
+ tiktoken_target_model: str | None = None,
148
+ image_dpi: int = 300,
149
+ image_format: str = 'png',
109
150
  ):
110
- """Init method for `DocumentSplitter` class.
111
-
112
- Args:
113
- separators: separators to use to chunk the document. Options are:
114
- `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
115
- This may be a comma-separated string, e.g., `'heading,token_limit'`.
116
- limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
117
- or `'char_limit'` is specified.
118
- metadata: additional metadata fields to include in the output. Options are:
119
- `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
120
- (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
121
- """
122
- if html_skip_tags is None:
123
- html_skip_tags = ['nav']
151
+ if skip_tags is None:
152
+ skip_tags = ['nav']
124
153
  self._doc_handle = get_document_handle(document)
154
+ self._elements = _parse_elements(elements.copy()) if elements is not None else [Element.TEXT]
125
155
  assert self._doc_handle is not None
126
- # calling the output_schema method to validate the input arguments
127
- self.output_schema(separators=separators, metadata=metadata, limit=limit, overlap=overlap)
128
156
  self._separators = _parse_separators(separators)
129
157
  self._metadata_fields = _parse_metadata(metadata)
130
158
  if self._doc_handle.bs_doc is not None:
@@ -136,10 +164,12 @@ class DocumentSplitter(ComponentIterator):
136
164
  else:
137
165
  self._doc_title = ''
138
166
  self._limit = 0 if limit is None else limit
139
- self._skip_tags = html_skip_tags
167
+ self._skip_tags = skip_tags
140
168
  self._overlap = 0 if overlap is None else overlap
141
169
  self._tiktoken_encoding = tiktoken_encoding
142
170
  self._tiktoken_target_model = tiktoken_target_model
171
+ self._image_dpi = image_dpi
172
+ self._image_format = image_format
143
173
 
144
174
  # set up processing pipeline
145
175
  if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
@@ -151,8 +181,11 @@ class DocumentSplitter(ComponentIterator):
151
181
  elif self._doc_handle.format == DocumentType.DocumentFormat.PDF:
152
182
  assert self._doc_handle.pdf_doc is not None
153
183
  self._sections = self._pdf_sections()
184
+ elif self._doc_handle.format == DocumentType.DocumentFormat.TXT:
185
+ assert self._doc_handle.txt_doc is not None
186
+ self._sections = self._txt_sections()
154
187
  else:
155
- assert False, f'Unsupported document format: {self._doc_handle.format}'
188
+ raise AssertionError(f'Unsupported document format: {self._doc_handle.format}')
156
189
 
157
190
  if Separator.SENTENCE in self._separators:
158
191
  self._sections = self._sentence_sections(self._sections)
@@ -166,19 +199,28 @@ class DocumentSplitter(ComponentIterator):
166
199
  return {
167
200
  'document': DocumentType(nullable=False),
168
201
  'separators': StringType(nullable=False),
202
+ 'elements': JsonType(nullable=False),
169
203
  'metadata': StringType(nullable=False),
170
204
  'limit': IntType(nullable=True),
171
205
  'overlap': IntType(nullable=True),
172
206
  'skip_tags': StringType(nullable=True),
173
207
  'tiktoken_encoding': StringType(nullable=True),
174
208
  'tiktoken_target_model': StringType(nullable=True),
209
+ 'image_dpi': IntType(nullable=True),
210
+ 'image_format': StringType(nullable=True),
175
211
  }
176
212
 
177
213
  @classmethod
178
214
  def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
179
- schema: dict[str, ColumnType] = {'text': StringType()}
180
- md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
181
-
215
+ schema: dict[str, ColumnType] = {}
216
+ elements = _parse_elements(kwargs.get('elements', ['text']))
217
+ for element in elements:
218
+ if element == Element.TEXT:
219
+ schema['text'] = StringType(nullable=False)
220
+ elif element == Element.IMAGE:
221
+ schema['image'] = ImageType(nullable=False)
222
+
223
+ md_fields = _parse_metadata(kwargs.get('metadata', ''))
182
224
  for md_field in md_fields:
183
225
  schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]
184
226
 
@@ -188,6 +230,8 @@ class DocumentSplitter(ComponentIterator):
188
230
  limit = kwargs.get('limit')
189
231
  overlap = kwargs.get('overlap')
190
232
 
233
+ if Element.IMAGE in elements and separators != [Separator.PAGE]:
234
+ raise Error('Image elements are only supported for the "page" separator on PDF documents')
191
235
  if limit is not None or overlap is not None:
192
236
  if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
193
237
  raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
@@ -201,9 +245,8 @@ class DocumentSplitter(ComponentIterator):
201
245
  if kwargs.get('limit') is None:
202
246
  raise Error('limit is required with "token_limit"/"char_limit" separators')
203
247
 
204
- # check dependencies at the end
205
248
  if Separator.SENTENCE in separators:
206
- Env.get().require_package('spacy')
249
+ _ = Env.get().spacy_nlp
207
250
  if Separator.TOKEN_LIMIT in separators:
208
251
  Env.get().require_package('tiktoken')
209
252
 
@@ -212,9 +255,15 @@ class DocumentSplitter(ComponentIterator):
212
255
  def __next__(self) -> dict[str, Any]:
213
256
  while True:
214
257
  section = next(self._sections)
215
- if section.text is None:
258
+ if section.text is None and section.image is None:
216
259
  continue
217
- result: dict[str, Any] = {'text': section.text}
260
+ result: dict[str, Any] = {}
261
+ for element in self._elements:
262
+ if element == Element.TEXT:
263
+ result['text'] = section.text
264
+ elif element == Element.IMAGE:
265
+ result['image'] = section.image
266
+
218
267
  for md_field in self._metadata_fields:
219
268
  if md_field == ChunkMetadata.TITLE:
220
269
  result[md_field.name.lower()] = self._doc_title
@@ -226,18 +275,20 @@ class DocumentSplitter(ComponentIterator):
226
275
  result[md_field.name.lower()] = section.metadata.page
227
276
  elif md_field == ChunkMetadata.BOUNDING_BOX:
228
277
  result[md_field.name.lower()] = section.metadata.bounding_box
278
+
229
279
  return result
230
280
 
231
281
  def _html_sections(self) -> Iterator[DocumentSection]:
232
282
  """Create DocumentSections reflecting the html-specific separators"""
233
283
  import bs4
284
+
234
285
  emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
235
286
  emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
236
287
  # current state
237
288
  accumulated_text: list[str] = [] # currently accumulated text
238
289
  # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
239
290
 
240
- headings: dict[str, str] = {} # current state of observed headings (level -> text)
291
+ headings: dict[str, str] = {} # current state of observed headings (level -> text)
241
292
  sourceline = 0 # most recently seen sourceline
242
293
 
243
294
  def update_metadata(el: bs4.Tag) -> None:
@@ -246,9 +297,9 @@ class DocumentSplitter(ComponentIterator):
246
297
  sourceline = el.sourceline
247
298
  if el.name in _HTML_HEADINGS:
248
299
  # remove the previously seen lower levels
249
- lower_levels = [l for l in headings if l > el.name]
250
- for l in lower_levels:
251
- del headings[l]
300
+ lower_levels = [lv for lv in headings if lv > el.name]
301
+ for lv in lower_levels:
302
+ del headings[lv]
252
303
  headings[el.name] = el.get_text().strip()
253
304
 
254
305
  def emit() -> Iterator[DocumentSection]:
@@ -260,7 +311,7 @@ class DocumentSplitter(ComponentIterator):
260
311
  yield DocumentSection(text=full_text, metadata=md)
261
312
  accumulated_text = []
262
313
 
263
- def process_element(el: Union[bs4.element.Tag, bs4.NavigableString]) -> Iterator[DocumentSection]:
314
+ def process_element(el: Tag | NavigableString) -> Iterator[DocumentSection]:
264
315
  # process the element and emit sections as necessary
265
316
  nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
266
317
 
@@ -297,7 +348,7 @@ class DocumentSplitter(ComponentIterator):
297
348
  # current state
298
349
  accumulated_text: list[str] = [] # currently accumulated text
299
350
  # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
300
- headings: dict[str, str] = {} # current state of observed headings (level -> text)
351
+ headings: dict[str, str] = {} # current state of observed headings (level -> text)
301
352
 
302
353
  def update_headings(heading: dict) -> None:
303
354
  # update current state
@@ -307,9 +358,9 @@ class DocumentSplitter(ComponentIterator):
307
358
  level = f'h{lint}'
308
359
  text = heading['children'][0]['raw'].strip()
309
360
  # remove the previously seen lower levels
310
- lower_levels = [l for l in headings.keys() if l > level]
311
- for l in lower_levels:
312
- del headings[l]
361
+ lower_levels = [lv for lv in headings if lv > level]
362
+ for lv in lower_levels:
363
+ del headings[lv]
313
364
  headings[level] = text
314
365
 
315
366
  def emit() -> Iterator[DocumentSection]:
@@ -348,47 +399,48 @@ class DocumentSplitter(ComponentIterator):
348
399
  yield from emit()
349
400
 
350
401
  def _pdf_sections(self) -> Iterator[DocumentSection]:
351
- """Create DocumentSections reflecting the pdf-specific separators"""
352
- import fitz # type: ignore[import-untyped]
353
- doc: fitz.Document = self._doc_handle.pdf_doc
354
- assert doc is not None
402
+ if Separator.PARAGRAPH in self._separators:
403
+ raise Error(
404
+ 'Paragraph splitting is not currently supported for PDF documents. Please contact'
405
+ ' us at https://github.com/pixeltable/pixeltable/issues if you need this feature.'
406
+ )
355
407
 
356
- emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
357
- emit_on_page = Separator.PAGE in self._separators or emit_on_paragraph
408
+ doc: PdfDocument = self._doc_handle.pdf_doc
409
+ assert isinstance(doc, PdfDocument)
358
410
 
359
- accumulated_text = [] # invariant: all elements are ftfy clean and non-empty
411
+ emit_on_page = Separator.PAGE in self._separators
412
+ accumulated_text: list[str] = []
360
413
 
361
- def _add_cleaned_text(raw_text: str) -> None:
362
- fixed = ftfy.fix_text(raw_text)
414
+ def _add_cleaned(raw: str) -> None:
415
+ fixed = ftfy.fix_text(raw)
363
416
  if fixed:
364
417
  accumulated_text.append(fixed)
365
418
 
366
419
  def _emit_text() -> str:
367
- full_text = ''.join(accumulated_text)
420
+ txt = ''.join(accumulated_text)
368
421
  accumulated_text.clear()
369
- return full_text
370
-
371
- for page_number, page in enumerate(doc.pages()):
372
- for block in page.get_text('blocks'):
373
- # there is no concept of paragraph in pdf, block is the closest thing
374
- # we can get (eg a paragraph in text may cut across pages)
375
- # see pymupdf docs https://pymupdf.readthedocs.io/en/latest/app1.html
376
- # other libraries like pdfminer also lack an explicit paragraph concept
377
- x1, y1, x2, y2, text, _, _ = block
378
- _add_cleaned_text(text)
379
- if accumulated_text and emit_on_paragraph:
380
- bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
381
- metadata = DocumentSectionMetadata(page=page_number, bounding_box=bbox)
382
- yield DocumentSection(text=_emit_text(), metadata=metadata)
383
-
384
- if accumulated_text and emit_on_page and not emit_on_paragraph:
385
- yield DocumentSection(text=_emit_text(),
386
- metadata=DocumentSectionMetadata(page=page_number))
387
- accumulated_text = []
422
+ return txt
423
+
424
+ for page_idx, page in enumerate(doc):
425
+ img = page.render().to_pil() if Element.IMAGE in self._elements else None
426
+ text = page.get_textpage().get_text_bounded()
427
+ _add_cleaned(text)
428
+ if accumulated_text and emit_on_page:
429
+ md = DocumentSectionMetadata(page=page_idx)
430
+ yield DocumentSection(text=_emit_text(), image=img, metadata=md)
388
431
 
389
432
  if accumulated_text and not emit_on_page:
390
433
  yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
391
434
 
435
+ def _txt_sections(self) -> Iterator[DocumentSection]:
436
+ """Create DocumentSections for text files.
437
+
438
+ Currently, it returns the entire text as a single section.
439
+ TODO: Add support for paragraphs.
440
+ """
441
+ assert self._doc_handle.txt_doc is not None
442
+ yield DocumentSection(text=ftfy.fix_text(self._doc_handle.txt_doc), metadata=DocumentSectionMetadata())
443
+
392
444
  def _sentence_sections(self, input_sections: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
393
445
  """Split the input sections into sentences"""
394
446
  for section in input_sections:
@@ -399,6 +451,7 @@ class DocumentSplitter(ComponentIterator):
399
451
 
400
452
  def _token_chunks(self, input: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
401
453
  import tiktoken
454
+
402
455
  if self._tiktoken_target_model is not None:
403
456
  encoding = tiktoken.encoding_for_model(self._tiktoken_target_model)
404
457
  else:
@@ -442,5 +495,9 @@ class DocumentSplitter(ComponentIterator):
442
495
  def close(self) -> None:
443
496
  pass
444
497
 
445
- def set_pos(self, pos: int) -> None:
446
- pass
498
+ @classmethod
499
+ @deprecated(
500
+ 'create() is deprecated; use `pixeltable.functions.document.document_splitter` instead', version='0.5.6'
501
+ )
502
+ def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
503
+ return super()._create(**kwargs)
@@ -1,6 +1,7 @@
1
1
  from typing import Any, Sequence
2
2
 
3
3
  import PIL.Image
4
+ from deprecated import deprecated
4
5
 
5
6
  import pixeltable.exceptions as excs
6
7
  import pixeltable.type_system as ts
@@ -8,18 +9,6 @@ from pixeltable.iterators.base import ComponentIterator
8
9
 
9
10
 
10
11
  class TileIterator(ComponentIterator):
11
- """
12
- Iterator over tiles of an image. Each image will be divided into tiles of size `tile_size`, and the tiles will be
13
- iterated over in row-major order (left-to-right, then top-to-bottom). An optional `overlap` parameter may be
14
- specified. If the tiles do not exactly cover the image, then the rightmost and bottommost tiles will be padded with
15
- blackspace, so that the output images all have the exact size `tile_size`.
16
-
17
- Args:
18
- image: Image to split into tiles.
19
- tile_size: Size of each tile, as a pair of integers `[width, height]`.
20
- overlap: Amount of overlap between adjacent tiles, as a pair of integers `[width, height]`.
21
- """
22
-
23
12
  __image: PIL.Image.Image
24
13
  __tile_size: Sequence[int]
25
14
  __overlap: Sequence[int]
@@ -30,15 +19,8 @@ class TileIterator(ComponentIterator):
30
19
  __i: int
31
20
  __j: int
32
21
 
33
- def __init__(
34
- self,
35
- image: PIL.Image.Image,
36
- *,
37
- tile_size: tuple[int, int],
38
- overlap: tuple[int, int] = (0, 0),
39
- ):
40
- if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
41
- raise excs.Error(f"overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}")
22
+ def __init__(self, image: PIL.Image.Image, *, tile_size: tuple[int, int], overlap: tuple[int, int] = (0, 0)):
23
+ assert overlap[0] < tile_size[0] and overlap[1] < tile_size[1]
42
24
 
43
25
  self.__image = image
44
26
  self.__image.load()
@@ -64,11 +46,7 @@ class TileIterator(ComponentIterator):
64
46
  x2 = x1 + self.__tile_size[0]
65
47
  y2 = y1 + self.__tile_size[1]
66
48
  tile = self.__image.crop((x1, y1, x2, y2))
67
- result = {
68
- 'tile': tile,
69
- 'tile_coord': [self.__i, self.__j],
70
- 'tile_box': [x1, y1, x2, y2]
71
- }
49
+ result = {'tile': tile, 'tile_coord': [self.__i, self.__j], 'tile_box': [x1, y1, x2, y2]}
72
50
 
73
51
  self.__i += 1
74
52
  if self.__i >= self.__xlen:
@@ -79,22 +57,23 @@ class TileIterator(ComponentIterator):
79
57
  def close(self) -> None:
80
58
  pass
81
59
 
82
- def set_pos(self, pos: int) -> None:
60
+ def set_pos(self, pos: int, **kwargs: Any) -> None:
83
61
  self.__j = pos // self.__xlen
84
62
  self.__i = pos % self.__xlen
85
63
 
86
64
  @classmethod
87
65
  def input_schema(cls, *args: Any, **kwargs: Any) -> dict[str, ts.ColumnType]:
88
- return {
89
- 'image': ts.ImageType(),
90
- 'tile_size': ts.JsonType(),
91
- 'overlap': ts.JsonType(),
92
- }
66
+ return {'image': ts.ImageType(), 'tile_size': ts.JsonType(), 'overlap': ts.JsonType()}
67
+
68
+ @classmethod
69
+ def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
70
+ tile_size = kwargs.get('tile_size')
71
+ overlap = kwargs.get('overlap', (0, 0))
72
+ if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
73
+ raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
74
+ return {'tile': ts.ImageType(), 'tile_coord': ts.JsonType(), 'tile_box': ts.JsonType()}, ['tile']
93
75
 
94
76
  @classmethod
95
- def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
96
- return {
97
- 'tile': ts.ImageType(),
98
- 'tile_coord': ts.JsonType(),
99
- 'tile_box': ts.JsonType(),
100
- }, ['tile']
77
+ @deprecated('create() is deprecated; use `pixeltable.functions.image.tile_iterator` instead', version='0.5.6')
78
+ def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
79
+ return super()._create(**kwargs)
@@ -1,13 +1,17 @@
1
- from typing import Iterator, Any
1
+ from typing import Any, Iterator
2
2
 
3
- import pixeltable.exceptions as excs
4
- import pixeltable.type_system as ts
3
+ from deprecated import deprecated
4
+
5
+ from pixeltable import exceptions as excs, type_system as ts
5
6
  from pixeltable.env import Env
6
7
  from pixeltable.iterators.base import ComponentIterator
7
8
 
8
9
 
9
10
  class StringSplitter(ComponentIterator):
10
- # TODO(aaron-siegel): Merge this with `DocumentSplitter` in order to provide additional capabilities.
11
+ _text: str
12
+ doc: Any # spacy doc
13
+ iter: Iterator[dict[str, Any]]
14
+
11
15
  def __init__(self, text: str, *, separators: str):
12
16
  if separators != 'sentence':
13
17
  raise excs.Error('Only `sentence` separators are currently supported.')
@@ -25,16 +29,15 @@ class StringSplitter(ComponentIterator):
25
29
  def close(self) -> None:
26
30
  pass
27
31
 
28
- def set_pos(self, pos: int) -> None:
29
- pass
30
-
31
32
  @classmethod
32
33
  def input_schema(cls, *args: Any, **kwargs: Any) -> dict[str, ts.ColumnType]:
33
- return {
34
- 'text': ts.StringType(),
35
- 'separators': ts.StringType(),
36
- }
34
+ return {'text': ts.StringType(), 'separators': ts.StringType()}
37
35
 
38
36
  @classmethod
39
- def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
37
+ def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
40
38
  return {'text': ts.StringType()}, []
39
+
40
+ @classmethod
41
+ @deprecated('create() is deprecated; use `pixeltable.functions.string.string_splitter` instead', version='0.5.6')
42
+ def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
43
+ return super()._create(**kwargs)