pixeltable 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (110)
  1. pixeltable/__init__.py +20 -9
  2. pixeltable/__version__.py +3 -0
  3. pixeltable/catalog/column.py +23 -7
  4. pixeltable/catalog/insertable_table.py +32 -19
  5. pixeltable/catalog/table.py +210 -20
  6. pixeltable/catalog/table_version.py +272 -111
  7. pixeltable/catalog/table_version_path.py +6 -1
  8. pixeltable/dataframe.py +184 -110
  9. pixeltable/datatransfer/__init__.py +1 -0
  10. pixeltable/datatransfer/label_studio.py +526 -0
  11. pixeltable/datatransfer/remote.py +113 -0
  12. pixeltable/env.py +213 -79
  13. pixeltable/exec/__init__.py +2 -1
  14. pixeltable/exec/data_row_batch.py +6 -7
  15. pixeltable/exec/expr_eval_node.py +28 -28
  16. pixeltable/exec/sql_scan_node.py +7 -6
  17. pixeltable/exprs/__init__.py +4 -3
  18. pixeltable/exprs/column_ref.py +11 -2
  19. pixeltable/exprs/comparison.py +39 -1
  20. pixeltable/exprs/data_row.py +7 -0
  21. pixeltable/exprs/expr.py +26 -19
  22. pixeltable/exprs/function_call.py +17 -18
  23. pixeltable/exprs/globals.py +14 -2
  24. pixeltable/exprs/image_member_access.py +9 -28
  25. pixeltable/exprs/in_predicate.py +96 -0
  26. pixeltable/exprs/inline_array.py +13 -11
  27. pixeltable/exprs/inline_dict.py +15 -13
  28. pixeltable/exprs/row_builder.py +7 -1
  29. pixeltable/exprs/similarity_expr.py +67 -0
  30. pixeltable/ext/functions/whisperx.py +30 -0
  31. pixeltable/ext/functions/yolox.py +16 -0
  32. pixeltable/func/__init__.py +0 -2
  33. pixeltable/func/aggregate_function.py +5 -2
  34. pixeltable/func/callable_function.py +57 -13
  35. pixeltable/func/expr_template_function.py +14 -3
  36. pixeltable/func/function.py +35 -4
  37. pixeltable/func/signature.py +5 -15
  38. pixeltable/func/udf.py +8 -12
  39. pixeltable/functions/fireworks.py +9 -4
  40. pixeltable/functions/huggingface.py +48 -5
  41. pixeltable/functions/openai.py +49 -11
  42. pixeltable/functions/pil/image.py +61 -64
  43. pixeltable/functions/together.py +32 -6
  44. pixeltable/functions/util.py +0 -43
  45. pixeltable/functions/video.py +46 -8
  46. pixeltable/globals.py +443 -0
  47. pixeltable/index/__init__.py +1 -0
  48. pixeltable/index/base.py +9 -2
  49. pixeltable/index/btree.py +54 -0
  50. pixeltable/index/embedding_index.py +91 -15
  51. pixeltable/io/__init__.py +4 -0
  52. pixeltable/io/globals.py +59 -0
  53. pixeltable/{utils → io}/hf_datasets.py +48 -17
  54. pixeltable/io/pandas.py +148 -0
  55. pixeltable/{utils → io}/parquet.py +58 -33
  56. pixeltable/iterators/__init__.py +1 -1
  57. pixeltable/iterators/base.py +8 -4
  58. pixeltable/iterators/document.py +225 -93
  59. pixeltable/iterators/video.py +16 -9
  60. pixeltable/metadata/__init__.py +8 -4
  61. pixeltable/metadata/converters/convert_12.py +3 -0
  62. pixeltable/metadata/converters/convert_13.py +41 -0
  63. pixeltable/metadata/converters/convert_14.py +13 -0
  64. pixeltable/metadata/converters/convert_15.py +29 -0
  65. pixeltable/metadata/converters/util.py +63 -0
  66. pixeltable/metadata/schema.py +12 -6
  67. pixeltable/plan.py +11 -24
  68. pixeltable/store.py +16 -23
  69. pixeltable/tool/create_test_db_dump.py +49 -14
  70. pixeltable/type_system.py +27 -58
  71. pixeltable/utils/coco.py +94 -0
  72. pixeltable/utils/documents.py +42 -12
  73. pixeltable/utils/http_server.py +70 -0
  74. pixeltable-0.2.7.dist-info/METADATA +137 -0
  75. pixeltable-0.2.7.dist-info/RECORD +126 -0
  76. {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +1 -1
  77. pixeltable/client.py +0 -600
  78. pixeltable/exprs/image_similarity_predicate.py +0 -58
  79. pixeltable/func/batched_function.py +0 -53
  80. pixeltable/func/nos_function.py +0 -202
  81. pixeltable/tests/conftest.py +0 -171
  82. pixeltable/tests/ext/test_yolox.py +0 -21
  83. pixeltable/tests/functions/test_fireworks.py +0 -43
  84. pixeltable/tests/functions/test_functions.py +0 -60
  85. pixeltable/tests/functions/test_huggingface.py +0 -158
  86. pixeltable/tests/functions/test_openai.py +0 -162
  87. pixeltable/tests/functions/test_together.py +0 -112
  88. pixeltable/tests/test_audio.py +0 -65
  89. pixeltable/tests/test_catalog.py +0 -27
  90. pixeltable/tests/test_client.py +0 -21
  91. pixeltable/tests/test_component_view.py +0 -379
  92. pixeltable/tests/test_dataframe.py +0 -440
  93. pixeltable/tests/test_dirs.py +0 -107
  94. pixeltable/tests/test_document.py +0 -120
  95. pixeltable/tests/test_exprs.py +0 -802
  96. pixeltable/tests/test_function.py +0 -332
  97. pixeltable/tests/test_index.py +0 -138
  98. pixeltable/tests/test_migration.py +0 -44
  99. pixeltable/tests/test_nos.py +0 -54
  100. pixeltable/tests/test_snapshot.py +0 -231
  101. pixeltable/tests/test_table.py +0 -1343
  102. pixeltable/tests/test_transactional_directory.py +0 -42
  103. pixeltable/tests/test_types.py +0 -52
  104. pixeltable/tests/test_video.py +0 -159
  105. pixeltable/tests/test_view.py +0 -535
  106. pixeltable/tests/utils.py +0 -442
  107. pixeltable/utils/clip.py +0 -18
  108. pixeltable-0.2.5.dist-info/METADATA +0 -128
  109. pixeltable-0.2.5.dist-info/RECORD +0 -139
  110. {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0
@@ -1,23 +1,25 @@
1
- from typing import Dict, Any, List, Tuple, Generator, Optional, Iterable
2
- import logging
3
1
  import dataclasses
4
2
  import enum
3
+ import logging
4
+ from typing import Dict, Any, List, Tuple, Optional, Iterable, Iterator
5
5
 
6
- from .base import ComponentIterator
6
+ import ftfy
7
7
 
8
- from pixeltable.type_system import ColumnType, DocumentType, StringType, IntType, JsonType
9
- from pixeltable.exceptions import Error
10
8
  from pixeltable.env import Env
9
+ from pixeltable.exceptions import Error
10
+ from pixeltable.type_system import ColumnType, DocumentType, StringType, IntType, JsonType
11
11
  from pixeltable.utils.documents import get_document_handle
12
-
12
+ from .base import ComponentIterator
13
13
 
14
14
  _logger = logging.getLogger('pixeltable')
15
15
 
16
16
 
17
17
  class ChunkMetadata(enum.Enum):
18
18
  TITLE = 1
19
- HEADINGS = 2
19
+ HEADING = 2
20
20
  SOURCELINE = 3
21
+ PAGE = 4
22
+ BOUNDING_BOX = 5
21
23
 
22
24
 
23
25
  class Separator(enum.Enum):
@@ -26,52 +28,115 @@ class Separator(enum.Enum):
26
28
  SENTENCE = 3
27
29
  TOKEN_LIMIT = 4
28
30
  CHAR_LIMIT = 5
31
+ PAGE = 6
29
32
 
30
33
 
31
34
  @dataclasses.dataclass
32
- class DocumentSectionMd:
35
+ class DocumentSectionMetadata:
33
36
  """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
34
- source_line: int
35
-
37
+ # html and markdown metadata
38
+ sourceline: Optional[int] = None
36
39
  # the stack of headings up to the most recently observed one;
37
40
  # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
38
- headings: Dict[int, str]
41
+ heading: Optional[Dict[int, str]] = None
42
+
43
+ # pdf-specific metadata
44
+ page: Optional[int] = None
45
+ # bounding box as an {x1, y1, x2, y2} dictionary
46
+ bounding_box: Optional[Dict[str, float]] = None
39
47
 
40
48
 
41
49
  @dataclasses.dataclass
42
50
  class DocumentSection:
43
51
  """A single document chunk, according to some of the splitting criteria"""
44
52
  text: Optional[str]
45
- md: Optional[DocumentSectionMd]
53
+ metadata: Optional[DocumentSectionMetadata]
54
+
55
+
56
+ def _parse_separators(separators: str) -> List[Separator]:
57
+ ret = []
58
+ for s in separators.split(','):
59
+ clean_s = s.strip().upper()
60
+ if not clean_s:
61
+ continue
62
+ if clean_s not in Separator.__members__:
63
+ raise Error(
64
+ f'Invalid separator: `{s.strip()}`. Valid separators are: {", ".join(Separator.__members__).lower()}'
65
+ )
66
+ ret.append(Separator[clean_s])
67
+ return ret
68
+
69
+
70
+ def _parse_metadata(metadata: str) -> List[ChunkMetadata]:
71
+ ret = []
72
+ for m in metadata.split(','):
73
+ clean_m = m.strip().upper()
74
+ if not clean_m:
75
+ continue
76
+ if clean_m not in ChunkMetadata.__members__:
77
+ raise Error(
78
+ f'Invalid metadata: `{m.strip()}`. Valid metadata are: {", ".join(ChunkMetadata.__members__).lower()}'
79
+ )
80
+ ret.append(ChunkMetadata[clean_m])
81
+ return ret
82
+
83
+
84
+ _HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
46
85
 
47
86
 
48
87
  class DocumentSplitter(ComponentIterator):
49
- """"Iterator over pieces of a document"""
50
- MD_COLUMN_TYPES = {
51
- ChunkMetadata.TITLE: StringType(),
52
- ChunkMetadata.HEADINGS: JsonType(),
53
- ChunkMetadata.SOURCELINE: IntType()
88
+ """Iterator over chunks of a document. The document is chunked according to the specified `separators`.
89
+
90
+ The iterator yields a `text` field containing the text of the chunk, and it may also
91
+ include additional metadata fields if specified in the `metadata` parameter, as explained below.
92
+
93
+ Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
94
+ """
95
+ METADATA_COLUMN_TYPES = {
96
+ ChunkMetadata.TITLE: StringType(nullable=True),
97
+ ChunkMetadata.HEADING: JsonType(nullable=True),
98
+ ChunkMetadata.SOURCELINE: IntType(nullable=True),
99
+ ChunkMetadata.PAGE: IntType(nullable=True),
100
+ ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
54
101
  }
55
102
 
56
103
  def __init__(
57
- self, document: str, *, separators: str, limit: int = 0, overlap: int = 0, metadata: str = '',
58
- html_skip_tags: List[str] = None, tiktoken_encoding: Optional[str] = 'cl100k_base',
104
+ self, document: str, *, separators: str, limit: Optional[int] = None, overlap: Optional[int] = None,
105
+ metadata: str = '',
106
+ html_skip_tags: Optional[list[str]] = None, tiktoken_encoding: Optional[str] = 'cl100k_base',
59
107
  tiktoken_target_model: Optional[str] = None
60
108
  ):
61
- import bs4
109
+ """Init method for `DocumentSplitter` class.
110
+
111
+ Args:
112
+ separators: separators to use to chunk the document. Options are:
113
+ `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
114
+ This may be a comma-separated string, e.g., `'heading,token_limit'`.
115
+ limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
116
+ or `'char_limit'` is specified.
117
+ metadata: additional metadata fields to include in the output. Options are:
118
+ `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
119
+ (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
120
+ """
62
121
  if html_skip_tags is None:
63
122
  html_skip_tags = ['nav']
64
- with open(document, 'r', encoding='utf8') as fh:
65
- s = fh.read()
66
- self._doc_handle = get_document_handle(s)
67
- assert self._doc_handle is not None
68
- self._separators = [Separator[s.upper()] for s in separators.split(',')]
69
- self._md_fields = [ChunkMetadata[m.upper()] for m in metadata.split(',')] if len(metadata) > 0 else []
70
- self._doc_title = \
71
- self._doc_handle.bs_doc.title.get_text().strip() if self._doc_handle.bs_doc is not None else ''
72
- self._limit = limit
123
+ self._doc_handle = get_document_handle(document)
124
+ assert self._doc_handle is not None
125
+ # calling the output_schema method to validate the input arguments
126
+ self.output_schema(separators=separators, metadata=metadata, limit=limit, overlap=overlap)
127
+ self._separators = _parse_separators(separators)
128
+ self._metadata_fields = _parse_metadata(metadata)
129
+ if self._doc_handle.bs_doc is not None:
130
+ title = self._doc_handle.bs_doc.title
131
+ if title is None:
132
+ self._doc_title = ''
133
+ else:
134
+ self._doc_title = ftfy.fix_text(title.get_text().strip())
135
+ else:
136
+ self._doc_title = ''
137
+ self._limit = 0 if limit is None else limit
73
138
  self._skip_tags = html_skip_tags
74
- self._overlap = overlap
139
+ self._overlap = 0 if overlap is None else overlap
75
140
  self._tiktoken_encoding = tiktoken_encoding
76
141
  self._tiktoken_target_model = tiktoken_target_model
77
142
 
@@ -79,9 +144,15 @@ class DocumentSplitter(ComponentIterator):
79
144
  if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
80
145
  assert self._doc_handle.bs_doc is not None
81
146
  self._sections = self._html_sections()
82
- else:
147
+ elif self._doc_handle.format == DocumentType.DocumentFormat.MD:
83
148
  assert self._doc_handle.md_ast is not None
84
149
  self._sections = self._markdown_sections()
150
+ elif self._doc_handle.format == DocumentType.DocumentFormat.PDF:
151
+ assert self._doc_handle.pdf_doc is not None
152
+ self._sections = self._pdf_sections()
153
+ else:
154
+ assert False, f'unknown document format: {self._doc_handle.format}'
155
+
85
156
  if Separator.SENTENCE in self._separators:
86
157
  self._sections = self._sentence_sections(self._sections)
87
158
  if Separator.TOKEN_LIMIT in self._separators:
@@ -105,38 +176,36 @@ class DocumentSplitter(ComponentIterator):
105
176
  @classmethod
106
177
  def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
107
178
  schema = {'text': StringType()}
108
- if 'metadata' in kwargs and len(kwargs['metadata']) > 0:
109
- md_fields = kwargs['metadata'].split(',')
110
- for md_field in md_fields:
111
- if not hasattr(ChunkMetadata, md_field.upper()):
112
- raise Error(f'Invalid metadata field {md_field}')
113
- schema[md_field.lower()] = cls.MD_COLUMN_TYPES[ChunkMetadata[md_field.upper()]]
179
+ md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
180
+
181
+ for md_field in md_fields:
182
+ schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]
114
183
 
115
184
  assert 'separators' in kwargs
116
- separators = kwargs['separators'].split(',')
117
- for separator in separators:
118
- if not hasattr(Separator, separator.upper()):
119
- raise Error(f'Invalid separator {separator}')
185
+ separators = _parse_separators(kwargs['separators'])
120
186
 
121
- # check dependencies
122
- if 'sentence' in separators:
123
- Env.get().require_package('spacy')
124
- if 'token_limit' in separators:
125
- Env.get().require_package('tiktoken')
187
+ limit = kwargs.get('limit')
188
+ overlap = kwargs.get('overlap')
126
189
 
127
- if 'limit' in kwargs or 'overlap' in kwargs:
128
- if 'token_limit' not in separators and 'char_limit' not in separators:
190
+ if limit is not None or overlap is not None:
191
+ if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
129
192
  raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
130
- if 'limit' in kwargs and int(kwargs['limit']) <= 0:
193
+ if limit is not None and limit <= 0:
131
194
  raise Error('"limit" must be an integer > 0')
132
- if 'overlap' in kwargs and int(kwargs['overlap']) < 0:
195
+ if overlap is not None and overlap < 0:
133
196
  raise Error('"overlap" must be an integer >= 0')
134
- if 'token_limit' in separators or 'char_limit' in separators:
135
- if 'token_limit' in separators and 'char_limit' in separators:
197
+ if Separator.TOKEN_LIMIT in separators or Separator.CHAR_LIMIT in separators:
198
+ if Separator.TOKEN_LIMIT in separators and Separator.CHAR_LIMIT in separators:
136
199
  raise Error('Cannot specify both "token_limit" and "char_limit" separators')
137
- if 'limit' not in kwargs:
200
+ if kwargs.get('limit') is None:
138
201
  raise Error('limit is required with "token_limit"/"char_limit" separators')
139
202
 
203
+ # check dependencies at the end
204
+ if Separator.SENTENCE in separators:
205
+ Env.get().require_package('spacy')
206
+ if Separator.TOKEN_LIMIT in separators:
207
+ Env.get().require_package('tiktoken')
208
+
140
209
  return schema, []
141
210
 
142
211
  def __next__(self) -> Dict[str, Any]:
@@ -145,47 +214,55 @@ class DocumentSplitter(ComponentIterator):
145
214
  if section.text is None:
146
215
  continue
147
216
  result = {'text': section.text}
148
- for md_field in self._md_fields:
217
+ for md_field in self._metadata_fields:
149
218
  if md_field == ChunkMetadata.TITLE:
150
219
  result[md_field.name.lower()] = self._doc_title
151
- elif md_field == ChunkMetadata.HEADINGS:
152
- result[md_field.name.lower()] = section.md.headings
220
+ elif md_field == ChunkMetadata.HEADING:
221
+ result[md_field.name.lower()] = section.metadata.heading
153
222
  elif md_field == ChunkMetadata.SOURCELINE:
154
- result[md_field.name.lower()] = section.md.source_line
223
+ result[md_field.name.lower()] = section.metadata.sourceline
224
+ elif md_field == ChunkMetadata.PAGE:
225
+ result[md_field.name.lower()] = section.metadata.page
226
+ elif md_field == ChunkMetadata.BOUNDING_BOX:
227
+ result[md_field.name.lower()] = section.metadata.bounding_box
155
228
  return result
156
229
 
157
- def _html_sections(self) -> Generator[DocumentSection, None, None]:
230
+ def _html_sections(self) -> Iterator[DocumentSection]:
158
231
  """Create DocumentSections reflecting the html-specific separators"""
159
232
  import bs4
160
233
  emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
161
234
  emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
162
235
  # current state
163
- text_section = '' # currently accumulated text
236
+ accumulated_text = [] # currently accumulated text
237
+ # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
238
+
164
239
  headings: Dict[int, str] = {} # current state of observed headings (level -> text)
165
240
  sourceline = 0 # most recently seen sourceline
166
241
 
167
- def update_md(el: bs4.Tag) -> None:
242
+ def update_metadata(el: bs4.Tag) -> None:
168
243
  # update current state
169
244
  nonlocal headings, sourceline
170
245
  sourceline = el.sourceline
171
- if el.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
246
+ if el.name in _HTML_HEADINGS:
172
247
  level = int(el.name[1])
173
248
  # remove the previously seen lower levels
174
- lower_levels = [l for l in headings.keys() if l > level]
249
+ lower_levels = [l for l in headings if l > level]
175
250
  for l in lower_levels:
176
251
  del headings[l]
177
252
  headings[level] = el.get_text().strip()
178
253
 
179
254
  def emit() -> None:
180
- nonlocal text_section, headings, sourceline
181
- if len(text_section) > 0:
182
- md = DocumentSectionMd(sourceline, headings.copy())
183
- yield DocumentSection(text=text_section, md=md)
184
- text_section = ''
185
-
186
- def process_element(el: bs4.PageElement) -> Generator[DocumentSection, None, None]:
255
+ nonlocal accumulated_text, headings, sourceline
256
+ if len(accumulated_text) > 0:
257
+ md = DocumentSectionMetadata(sourceline=sourceline, heading=headings.copy())
258
+ full_text = ' '.join(accumulated_text)
259
+ full_text = ftfy.fix_text(full_text)
260
+ yield DocumentSection(text=full_text, metadata=md)
261
+ accumulated_text = []
262
+
263
+ def process_element(el: bs4.PageElement) -> Iterator[DocumentSection]:
187
264
  # process the element and emit sections as necessary
188
- nonlocal text_section, headings, sourceline, emit_on_heading, emit_on_paragraph
265
+ nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
189
266
  if el.name in self._skip_tags:
190
267
  return
191
268
 
@@ -193,30 +270,31 @@ class DocumentSplitter(ComponentIterator):
193
270
  # accumulate text until we see a tag we care about
194
271
  text = el.get_text().strip()
195
272
  if len(text) > 0:
196
- text_section += ' ' + text
273
+ accumulated_text.append(text)
197
274
  return
198
275
 
199
- if el.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
276
+ if el.name in _HTML_HEADINGS:
200
277
  if emit_on_heading:
201
278
  yield from emit()
202
- update_md(el)
279
+ update_metadata(el)
203
280
  elif el.name == 'p':
204
281
  if emit_on_paragraph:
205
282
  yield from emit()
206
- update_md(el)
283
+ update_metadata(el)
207
284
  for child in el.children:
208
285
  yield from process_element(child)
209
286
 
210
287
  yield from process_element(self._doc_handle.bs_doc)
211
288
  yield from emit()
212
289
 
213
- def _markdown_sections(self) -> Generator[DocumentSection, None, None]:
290
+ def _markdown_sections(self) -> Iterator[DocumentSection]:
214
291
  """Create DocumentSections reflecting the html-specific separators"""
215
292
  assert self._doc_handle.md_ast is not None
216
293
  emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
217
294
  emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
218
295
  # current state
219
- text_section = '' # currently accumulated text
296
+ accumulated_text = [] # currently accumulated text
297
+ # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
220
298
  headings: Dict[int, str] = {} # current state of observed headings (level -> text)
221
299
 
222
300
  def update_headings(heading: Dict) -> None:
@@ -232,22 +310,22 @@ class DocumentSplitter(ComponentIterator):
232
310
  headings[level] = text
233
311
 
234
312
  def emit() -> None:
235
- nonlocal text_section, headings
236
- if len(text_section) > 0:
237
- md = DocumentSectionMd(0, headings.copy())
238
- yield DocumentSection(text=text_section, md=md)
239
- text_section = ''
313
+ nonlocal accumulated_text, headings
314
+ if len(accumulated_text) > 0:
315
+ metadata = DocumentSectionMetadata(sourceline=0, heading=headings.copy())
316
+ yield DocumentSection(text=ftfy.fix_text(' '.join(accumulated_text)), metadata=metadata)
317
+ accumulated_text = []
240
318
 
241
- def process_element(el: Dict) -> Generator[DocumentSection, None, None]:
319
+ def process_element(el: Dict) -> Iterator[DocumentSection]:
242
320
  # process the element and emit sections as necessary
243
- nonlocal text_section, headings, emit_on_heading, emit_on_paragraph
321
+ nonlocal accumulated_text, headings, emit_on_heading, emit_on_paragraph
244
322
  assert 'type' in el
245
323
 
246
324
  if el['type'] == 'text':
247
325
  # accumulate text until we see a separator element
248
326
  text = el['raw'].strip()
249
327
  if len(text) > 0:
250
- text_section += ' ' + text
328
+ accumulated_text.append(text)
251
329
  return
252
330
 
253
331
  if el['type'] == 'heading':
@@ -266,15 +344,57 @@ class DocumentSplitter(ComponentIterator):
266
344
  yield from process_element(el)
267
345
  yield from emit()
268
346
 
269
- def _sentence_sections(self, input_sections: Iterable[DocumentSection]) -> Generator[DocumentSection, None, None]:
347
+ def _pdf_sections(self) -> Iterator[DocumentSection]:
348
+ """Create DocumentSections reflecting the pdf-specific separators"""
349
+ import fitz
350
+ doc: fitz.Document = self._doc_handle.pdf_doc
351
+ assert doc is not None
352
+
353
+ emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
354
+ emit_on_page = Separator.PAGE in self._separators or emit_on_paragraph
355
+
356
+ accumulated_text = [] # invariant: all elements are ftfy clean and non-empty
357
+
358
+ def _add_cleaned_text(raw_text: str) -> None:
359
+ fixed = ftfy.fix_text(raw_text)
360
+ if fixed:
361
+ accumulated_text.append(fixed)
362
+
363
+ def _emit_text() -> str:
364
+ full_text = ''.join(accumulated_text)
365
+ accumulated_text.clear()
366
+ return full_text
367
+
368
+ for page_number, page in enumerate(doc.pages()):
369
+ for block in page.get_text('blocks'):
370
+ # there is no concept of paragraph in pdf, block is the closest thing
371
+ # we can get (eg a paragraph in text may cut across pages)
372
+ # see pymupdf docs https://pymupdf.readthedocs.io/en/latest/app1.html
373
+ # other libraries like pdfminer also lack an explicit paragraph concept
374
+ x1, y1, x2, y2, text, _, _ = block
375
+ _add_cleaned_text(text)
376
+ if accumulated_text and emit_on_paragraph:
377
+ bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
378
+ metadata = DocumentSectionMetadata(page=page_number, bounding_box=bbox)
379
+ yield DocumentSection(text=_emit_text(), metadata=metadata)
380
+
381
+ if accumulated_text and emit_on_page and not emit_on_paragraph:
382
+ yield DocumentSection(text=_emit_text(),
383
+ metadata=DocumentSectionMetadata(page=page_number))
384
+ accumulated_text = []
385
+
386
+ if accumulated_text and not emit_on_page:
387
+ yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
388
+
389
+ def _sentence_sections(self, input_sections: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
270
390
  """Split the input sections into sentences"""
271
391
  for section in input_sections:
272
392
  if section.text is not None:
273
393
  doc = Env.get().spacy_nlp(section.text)
274
394
  for sent in doc.sents:
275
- yield DocumentSection(text=sent.text, md=section.md)
395
+ yield DocumentSection(text=sent.text, metadata=section.metadata)
276
396
 
277
- def _token_chunks(self, input: Iterable[DocumentSection]) -> Generator[DocumentSection, None, None]:
397
+ def _token_chunks(self, input: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
278
398
  import tiktoken
279
399
  if self._tiktoken_target_model is not None:
280
400
  encoding = tiktoken.encoding_for_model(self._tiktoken_target_model)
@@ -287,13 +407,25 @@ class DocumentSplitter(ComponentIterator):
287
407
  continue
288
408
  tokens = encoding.encode(section.text)
289
409
  start_idx = 0
410
+ text = None
290
411
  while start_idx < len(tokens):
291
412
  end_idx = min(start_idx + self._limit, len(tokens))
292
- text = encoding.decode(tokens[start_idx:end_idx])
293
- yield DocumentSection(text=text, md=section.md)
294
- start_idx += self._limit - self._overlap
295
-
296
- def _char_chunks(self, input: Iterable[DocumentSection]) -> Generator[DocumentSection, None, None]:
413
+ while end_idx > start_idx:
414
+ # find a cutoff point that doesn't cut in the middle of utf8 multi-byte sequences
415
+ try:
416
+ # check that the truncated data can be properly decoded
417
+ text = encoding.decode(tokens[start_idx:end_idx], errors='strict')
418
+ break
419
+ except UnicodeDecodeError:
420
+ # we split the token array at a point where the utf8 encoding is broken
421
+ end_idx -= 1
422
+
423
+ assert end_idx > start_idx
424
+ assert text
425
+ yield DocumentSection(text=text, metadata=section.metadata)
426
+ start_idx = max(start_idx + 1, end_idx - self._overlap) # ensure we make progress
427
+
428
+ def _char_chunks(self, input: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
297
429
  for section in input:
298
430
  if section.text is None:
299
431
  continue
@@ -301,7 +433,7 @@ class DocumentSplitter(ComponentIterator):
301
433
  while start_idx < len(section.text):
302
434
  end_idx = min(start_idx + self._limit, len(section.text))
303
435
  text = section.text[start_idx:end_idx]
304
- yield DocumentSection(text=text, md=section.md)
436
+ yield DocumentSection(text=text, metadata=section.metadata)
305
437
  start_idx += self._limit - self._overlap
306
438
 
307
439
  def close(self) -> None:
@@ -1,21 +1,28 @@
1
- from typing import Dict, Any, List, Tuple
2
- from pathlib import Path
3
- import math
4
1
  import logging
2
+ import math
3
+ from pathlib import Path
4
+ from typing import Dict, Any, List, Tuple
5
5
 
6
- import cv2
7
6
  import PIL.Image
7
+ import cv2
8
8
 
9
- from .base import ComponentIterator
10
-
11
- from pixeltable.type_system import ColumnType, VideoType, ImageType, IntType, FloatType
12
9
  from pixeltable.exceptions import Error
13
-
10
+ from pixeltable.type_system import ColumnType, VideoType, ImageType, IntType, FloatType
11
+ from .base import ComponentIterator
14
12
 
15
13
  _logger = logging.getLogger('pixeltable')
16
14
 
15
+
17
16
  class FrameIterator(ComponentIterator):
18
- def __init__(self, video: str, fps: float = 0.0):
17
+ """Iterator over frames of a video.
18
+
19
+ Args:
20
+ video: URL or file of the video to use for frame extraction
21
+ fps: number of frames to extract per second of video. This may be a fractional value, such as 0.5.
22
+ If set to 0.0, then the native framerate of the video will be used (all frames will be extracted).
23
+ Default: 0.0
24
+ """
25
+ def __init__(self, video: str, *, fps: float = 0.0):
19
26
  video_path = Path(video)
20
27
  assert video_path.exists() and video_path.is_file()
21
28
  self.video_path = video_path
@@ -10,11 +10,11 @@ import sqlalchemy.orm as orm
10
10
  from .schema import SystemInfo, SystemInfoMd
11
11
 
12
12
  # current version of the metadata; this is incremented whenever the metadata schema changes
13
- VERSION = 12
13
+ VERSION = 16
14
14
 
15
15
 
16
16
  def create_system_info(engine: sql.engine.Engine) -> None:
17
- """Create the systemmetadata record"""
17
+ """Create the system metadata record"""
18
18
  system_md = SystemInfoMd(schema_version=VERSION)
19
19
  record = SystemInfo(md=dataclasses.asdict(system_md))
20
20
  with orm.Session(engine, future=True) as session:
@@ -30,17 +30,21 @@ def register_converter(version: int, cb: Callable[[sql.engine.Engine], None]) ->
30
30
  global converter_cbs
31
31
  converter_cbs[version] = cb
32
32
 
33
+ def noop_converter(engine: sql.engine.Engine) -> None:
34
+ # Converter to use when incrementing the schema version, but without any functional changes
35
+ pass
36
+
33
37
  # load all converter modules
34
38
  for _, modname, _ in pkgutil.iter_modules([os.path.dirname(__file__) + '/converters']):
35
39
  importlib.import_module('pixeltable.metadata.converters.' + modname)
36
40
 
37
41
  def upgrade_md(engine: sql.engine.Engine) -> None:
38
42
  """Upgrade the metadata schema to the current version"""
39
- with orm.Session(engine, future=True) as session:
43
+ with orm.Session(engine) as session:
40
44
  system_info = session.query(SystemInfo).one().md
41
45
  md_version = system_info['schema_version']
42
46
  if md_version == VERSION:
43
- return
47
+ return
44
48
  while md_version < VERSION:
45
49
  if md_version not in converter_cbs:
46
50
  raise RuntimeError(f'No metadata converter for version {md_version}')
@@ -0,0 +1,3 @@
1
+ from pixeltable.metadata import register_converter, noop_converter
2
+
3
+ register_converter(12, noop_converter)
@@ -0,0 +1,41 @@
1
+ import logging
2
+ from typing import Any
3
+
4
+ import sqlalchemy as sql
5
+
6
+ from pixeltable.metadata import register_converter
7
+ from pixeltable.metadata.schema import Table
8
+
9
+ _logger = logging.getLogger('pixeltable')
10
+
11
+
12
+ def convert_13(engine: sql.engine.Engine) -> None:
13
+ with engine.begin() as conn:
14
+ for row in conn.execute(sql.select(Table)):
15
+ id = row[0]
16
+ md = row[2]
17
+ updated_md = _update_md(md)
18
+ if updated_md != md:
19
+ _logger.info(f'Updating schema for table: {id}')
20
+ conn.execute(sql.update(Table).where(Table.id == id).values(md=updated_md))
21
+
22
+
23
+ # Traverse the schema dictionary and replace instances of `ExplicitBatchedFunction` with
24
+ # `CallableFunction`. DB versions prior to 14 can't contain serialized batched functions,
25
+ # so this is all we need to do.
26
+ def _update_md(md: Any) -> Any:
27
+ if isinstance(md, dict):
28
+ updated_md = {}
29
+ for k, v in md.items():
30
+ if k == '_classpath' and v == 'pixeltable.func.batched_function.ExplicitBatchedFunction':
31
+ updated_md[k] = 'pixeltable.func.callable_function.CallableFunction'
32
+ else:
33
+ updated_md[k] = _update_md(v)
34
+ return updated_md
35
+ elif isinstance(md, list):
36
+ return [_update_md(v) for v in md]
37
+ else:
38
+ return md
39
+
40
+
41
+ register_converter(13, convert_13)
@@ -0,0 +1,13 @@
1
+ import sqlalchemy as sql
2
+
3
+ from pixeltable.metadata.schema import Table
4
+ from pixeltable.metadata import register_converter
5
+
6
+
7
+ def convert_14(engine: sql.engine.Engine) -> None:
8
+ default_remotes = {'remotes': []}
9
+ with engine.begin() as conn:
10
+ conn.execute(sql.update(Table).where(Table.md['remotes'] == None).values(md=Table.md.concat(default_remotes)))
11
+
12
+
13
+ register_converter(14, convert_14)
@@ -0,0 +1,29 @@
1
+ import uuid
2
+
3
+ import sqlalchemy as sql
4
+
5
+ from pixeltable.metadata import register_converter
6
+ from pixeltable.metadata.converters.util import convert_table_md
7
+
8
+
9
+ def convert_15(engine: sql.engine.Engine) -> None:
10
+ convert_table_md(engine, column_md_updater=update_column_md, remote_md_updater=update_remote_md)
11
+
12
+
13
+ def update_column_md(column_md: dict) -> None:
14
+ column_md['proxy_base'] = None
15
+
16
+
17
+ def update_remote_md(remote_md: dict) -> None:
18
+ remote_md['class'] = f'{remote_md["module"]}.{remote_md["class"]}'
19
+ del remote_md['module']
20
+ if remote_md['class'] == 'pixeltable.datatransfer.remote.MockRemote':
21
+ remote_md['remote_md']['name'] = f'remote_{uuid.uuid4()}'
22
+ elif remote_md['class'] == 'pixeltable.datatransfer.label_studio.LabelStudioProject':
23
+ # 'post' is the media_import_method for legacy LabelStudioProject remotes
24
+ remote_md['remote_md']['media_import_method'] = 'post'
25
+ else:
26
+ assert False, remote_md['class']
27
+
28
+
29
+ register_converter(15, convert_15)