pixeltable 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (99)
  1. pixeltable/__init__.py +18 -9
  2. pixeltable/__version__.py +3 -0
  3. pixeltable/catalog/column.py +31 -50
  4. pixeltable/catalog/insertable_table.py +7 -6
  5. pixeltable/catalog/table.py +171 -57
  6. pixeltable/catalog/table_version.py +417 -140
  7. pixeltable/catalog/table_version_path.py +2 -2
  8. pixeltable/dataframe.py +239 -121
  9. pixeltable/env.py +82 -16
  10. pixeltable/exec/__init__.py +2 -1
  11. pixeltable/exec/cache_prefetch_node.py +1 -1
  12. pixeltable/exec/data_row_batch.py +6 -7
  13. pixeltable/exec/expr_eval_node.py +28 -28
  14. pixeltable/exec/in_memory_data_node.py +11 -7
  15. pixeltable/exec/sql_scan_node.py +7 -6
  16. pixeltable/exprs/__init__.py +4 -3
  17. pixeltable/exprs/column_ref.py +9 -0
  18. pixeltable/exprs/comparison.py +3 -3
  19. pixeltable/exprs/data_row.py +5 -1
  20. pixeltable/exprs/expr.py +15 -7
  21. pixeltable/exprs/function_call.py +17 -15
  22. pixeltable/exprs/image_member_access.py +9 -28
  23. pixeltable/exprs/in_predicate.py +96 -0
  24. pixeltable/exprs/inline_array.py +13 -11
  25. pixeltable/exprs/inline_dict.py +15 -13
  26. pixeltable/exprs/literal.py +16 -4
  27. pixeltable/exprs/row_builder.py +15 -41
  28. pixeltable/exprs/similarity_expr.py +65 -0
  29. pixeltable/ext/__init__.py +5 -0
  30. pixeltable/ext/functions/yolox.py +92 -0
  31. pixeltable/func/__init__.py +0 -2
  32. pixeltable/func/aggregate_function.py +18 -15
  33. pixeltable/func/callable_function.py +57 -13
  34. pixeltable/func/expr_template_function.py +20 -3
  35. pixeltable/func/function.py +35 -4
  36. pixeltable/func/globals.py +24 -14
  37. pixeltable/func/signature.py +23 -27
  38. pixeltable/func/udf.py +13 -12
  39. pixeltable/functions/__init__.py +8 -8
  40. pixeltable/functions/eval.py +7 -8
  41. pixeltable/functions/huggingface.py +64 -17
  42. pixeltable/functions/openai.py +36 -3
  43. pixeltable/functions/pil/image.py +61 -64
  44. pixeltable/functions/together.py +21 -0
  45. pixeltable/functions/util.py +11 -0
  46. pixeltable/globals.py +425 -0
  47. pixeltable/index/__init__.py +2 -0
  48. pixeltable/index/base.py +51 -0
  49. pixeltable/index/embedding_index.py +168 -0
  50. pixeltable/io/__init__.py +3 -0
  51. pixeltable/{utils → io}/hf_datasets.py +48 -17
  52. pixeltable/io/pandas.py +148 -0
  53. pixeltable/{utils → io}/parquet.py +58 -33
  54. pixeltable/iterators/__init__.py +1 -1
  55. pixeltable/iterators/base.py +4 -0
  56. pixeltable/iterators/document.py +218 -97
  57. pixeltable/iterators/video.py +8 -9
  58. pixeltable/metadata/__init__.py +7 -3
  59. pixeltable/metadata/converters/convert_12.py +3 -0
  60. pixeltable/metadata/converters/convert_13.py +41 -0
  61. pixeltable/metadata/schema.py +45 -22
  62. pixeltable/plan.py +15 -51
  63. pixeltable/store.py +38 -41
  64. pixeltable/tool/create_test_db_dump.py +39 -4
  65. pixeltable/type_system.py +47 -96
  66. pixeltable/utils/documents.py +42 -12
  67. pixeltable/utils/http_server.py +70 -0
  68. {pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/METADATA +14 -10
  69. pixeltable-0.2.6.dist-info/RECORD +119 -0
  70. {pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/WHEEL +1 -1
  71. pixeltable/client.py +0 -604
  72. pixeltable/exprs/image_similarity_predicate.py +0 -58
  73. pixeltable/func/batched_function.py +0 -53
  74. pixeltable/tests/conftest.py +0 -177
  75. pixeltable/tests/functions/test_fireworks.py +0 -42
  76. pixeltable/tests/functions/test_functions.py +0 -60
  77. pixeltable/tests/functions/test_huggingface.py +0 -158
  78. pixeltable/tests/functions/test_openai.py +0 -152
  79. pixeltable/tests/functions/test_together.py +0 -111
  80. pixeltable/tests/test_audio.py +0 -65
  81. pixeltable/tests/test_catalog.py +0 -27
  82. pixeltable/tests/test_client.py +0 -21
  83. pixeltable/tests/test_component_view.py +0 -370
  84. pixeltable/tests/test_dataframe.py +0 -439
  85. pixeltable/tests/test_dirs.py +0 -107
  86. pixeltable/tests/test_document.py +0 -120
  87. pixeltable/tests/test_exprs.py +0 -805
  88. pixeltable/tests/test_function.py +0 -324
  89. pixeltable/tests/test_migration.py +0 -43
  90. pixeltable/tests/test_nos.py +0 -54
  91. pixeltable/tests/test_snapshot.py +0 -208
  92. pixeltable/tests/test_table.py +0 -1267
  93. pixeltable/tests/test_transactional_directory.py +0 -42
  94. pixeltable/tests/test_types.py +0 -22
  95. pixeltable/tests/test_video.py +0 -159
  96. pixeltable/tests/test_view.py +0 -530
  97. pixeltable/tests/utils.py +0 -408
  98. pixeltable-0.2.4.dist-info/RECORD +0 -132
  99. {pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/LICENSE +0 -0
--- a/pixeltable/iterators/document.py
+++ b/pixeltable/iterators/document.py
@@ -1,24 +1,24 @@
-from typing import Dict, Any, List, Tuple, Generator, Optional, Iterable
-import logging
 import dataclasses
 import enum
+import logging
+from typing import Dict, Any, List, Tuple, Optional, Iterable, Iterator
 
-from .base import ComponentIterator
+import ftfy
 
-from pixeltable.type_system import ColumnType, DocumentType, StringType, IntType, JsonType
-from pixeltable.exceptions import Error
 from pixeltable.env import Env
+from pixeltable.exceptions import Error
+from pixeltable.type_system import ColumnType, DocumentType, StringType, IntType, JsonType
 from pixeltable.utils.documents import get_document_handle
-
+from .base import ComponentIterator
 
 _logger = logging.getLogger('pixeltable')
 
-
 class ChunkMetadata(enum.Enum):
     TITLE = 1
-    HEADINGS = 2
+    HEADING = 2
     SOURCELINE = 3
-
+    PAGE = 4
+    BOUNDING_BOX = 5
 
 class Separator(enum.Enum):
     HEADING = 1
@@ -26,52 +26,106 @@ class Separator(enum.Enum):
     SENTENCE = 3
     TOKEN_LIMIT = 4
     CHAR_LIMIT = 5
-
+    PAGE = 6
 
 @dataclasses.dataclass
-class DocumentSectionMd:
+class DocumentSectionMetadata:
     """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
-    source_line: int
-
+    # html and markdown metadata
+    sourceline: Optional[int] = None
     # the stack of headings up to the most recently observed one;
     # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
-    headings: Dict[int, str]
+    heading: Optional[Dict[int, str]] = None
 
+    # pdf-specific metadata
+    page: Optional[int] = None
+    # bounding box as an {x1, y1, x2, y2} dictionary
+    bounding_box: Optional[Dict[str, float]] = None
 
 @dataclasses.dataclass
 class DocumentSection:
     """A single document chunk, according to some of the splitting criteria"""
     text: Optional[str]
-    md: Optional[DocumentSectionMd]
-
+    metadata: Optional[DocumentSectionMetadata]
+
+
+def _parse_separators(separators: str) -> List[Separator]:
+    ret = []
+    for s in separators.split(','):
+        clean_s = s.strip().upper()
+        if not clean_s:
+            continue
+        if clean_s not in Separator.__members__:
+            raise Error(
+                f'Invalid separator: `{s.strip()}`. Valid separators are: {", ".join(Separator.__members__).lower()}'
+            )
+        ret.append(Separator[clean_s])
+    return ret
+
+
+def _parse_metadata(metadata: str) -> List[ChunkMetadata]:
+    ret = []
+    for m in metadata.split(','):
+        clean_m = m.strip().upper()
+        if not clean_m:
+            continue
+        if clean_m not in ChunkMetadata.__members__:
+            raise Error(
+                f'Invalid metadata: `{m.strip()}`. Valid metadata are: {", ".join(ChunkMetadata.__members__).lower()}'
+            )
+        ret.append(ChunkMetadata[clean_m])
+    return ret
+
+
+_HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
 
 class DocumentSplitter(ComponentIterator):
-    """"Iterator over pieces of a document"""
-    MD_COLUMN_TYPES = {
-        ChunkMetadata.TITLE: StringType(),
-        ChunkMetadata.HEADINGS: JsonType(),
-        ChunkMetadata.SOURCELINE: IntType()
+    """Iterator over pieces of a document. The document is split into chunks based on the specified separators.
+    The iterator output tuples are of schema {'text': StringType()}, but can include additional metadata fields if specified
+    in the `metadata` argument as explained below.
+    All chunk text is passed through `ftfy.fix_text` to fix up common problems with unicode sequences.
+
+    Args:
+        `metadata`: which additional metadata fields to include in the output schema:
+            'title', 'heading' (HTML and Markdown), 'sourceline' (HTML), 'page' (PDF), 'bounding_box' (PDF).
+            The input can be a comma-separated string of these values eg. 'title,heading,sourceline'.
+        `separators`: which separators to use to split the document into rows. Options are:
+            'heading', 'paragraph', 'sentence', 'token_limit', 'char_limit', 'page'. As with metadata, this is can be a
+            comma-separated string eg. 'heading, token_limit'.
+        `limit`: the maximum number of tokens or characters in each chunk if 'token_limit' or 'char_limit' is specified.
+    """
+    METADATA_COLUMN_TYPES = {
+        ChunkMetadata.TITLE: StringType(nullable=True),
+        ChunkMetadata.HEADING: JsonType(nullable=True),
+        ChunkMetadata.SOURCELINE: IntType(nullable=True),
+        ChunkMetadata.PAGE: IntType(nullable=True),
+        ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
     }
 
     def __init__(
-        self, document: str, *, separators: str, limit: int = 0, overlap: int = 0, metadata: str = '',
-        html_skip_tags: List[str] = None, tiktoken_encoding: Optional[str] = 'cl100k_base',
+        self, document: str, *, separators: str, limit: Optional[int] = None, overlap: Optional[int] = None, metadata: str = '',
+        html_skip_tags: Optional[List[str]] = None, tiktoken_encoding: Optional[str] = 'cl100k_base',
         tiktoken_target_model: Optional[str] = None
     ):
-        import bs4
         if html_skip_tags is None:
            html_skip_tags = ['nav']
-        with open(document, 'r', encoding='utf8') as fh:
-            s = fh.read()
-        self._doc_handle = get_document_handle(s)
-        assert self._doc_handle is not None
-        self._separators = [Separator[s.upper()] for s in separators.split(',')]
-        self._md_fields = [ChunkMetadata[m.upper()] for m in metadata.split(',')] if len(metadata) > 0 else []
-        self._doc_title = \
-            self._doc_handle.bs_doc.title.get_text().strip() if self._doc_handle.bs_doc is not None else ''
-        self._limit = limit
+        self._doc_handle = get_document_handle(document)
+        assert self._doc_handle is not None
+        # calling the output_schema method to validate the input arguments
+        self.output_schema(separators=separators, metadata=metadata, limit=limit, overlap=overlap)
+        self._separators = _parse_separators(separators)
+        self._metadata_fields = _parse_metadata(metadata)
+        if self._doc_handle.bs_doc is not None:
+            title = self._doc_handle.bs_doc.title
+            if title is None:
+                self._doc_title = ''
+            else:
+                self._doc_title = ftfy.fix_text(title.get_text().strip())
+        else:
+            self._doc_title = ''
+        self._limit = 0 if limit is None else limit
         self._skip_tags = html_skip_tags
-        self._overlap = overlap
+        self._overlap = 0 if overlap is None else overlap
         self._tiktoken_encoding = tiktoken_encoding
         self._tiktoken_target_model = tiktoken_target_model
 
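Note: the reworked constructor no longer opens the file itself; `get_document_handle(document)` receives the argument directly, and argument validation is delegated to `output_schema`. A minimal usage sketch (hypothetical file name `doc.html`; the loop relies only on the `__next__` and `close` methods shown in this diff):

    from pixeltable.iterators.document import DocumentSplitter

    splitter = DocumentSplitter(
        'doc.html',  # hypothetical local file
        separators='heading',
        metadata='title,heading,sourceline',
    )
    while True:
        try:
            chunk = next(splitter)  # dict with 'text' plus the requested metadata keys
        except StopIteration:
            break
        print(chunk['sourceline'], chunk['text'][:60])
    splitter.close()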
@@ -79,9 +133,15 @@ class DocumentSplitter(ComponentIterator):
         if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
             assert self._doc_handle.bs_doc is not None
             self._sections = self._html_sections()
-        else:
+        elif self._doc_handle.format == DocumentType.DocumentFormat.MD:
             assert self._doc_handle.md_ast is not None
             self._sections = self._markdown_sections()
+        elif self._doc_handle.format == DocumentType.DocumentFormat.PDF:
+            assert self._doc_handle.pdf_doc is not None
+            self._sections = self._pdf_sections()
+        else:
+            assert False, f'unknown document format: {self._doc_handle.format}'
+
         if Separator.SENTENCE in self._separators:
             self._sections = self._sentence_sections(self._sections)
         if Separator.TOKEN_LIMIT in self._separators:
@@ -105,38 +165,36 @@ class DocumentSplitter(ComponentIterator):
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
         schema = {'text': StringType()}
-        if 'metadata' in kwargs and len(kwargs['metadata']) > 0:
-            md_fields = kwargs['metadata'].split(',')
-            for md_field in md_fields:
-                if not hasattr(ChunkMetadata, md_field.upper()):
-                    raise Error(f'Invalid metadata field {md_field}')
-                schema[md_field.lower()] = cls.MD_COLUMN_TYPES[ChunkMetadata[md_field.upper()]]
+        md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
+
+        for md_field in md_fields:
+            schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]
 
         assert 'separators' in kwargs
-        separators = kwargs['separators'].split(',')
-        for separator in separators:
-            if not hasattr(Separator, separator.upper()):
-                raise Error(f'Invalid separator {separator}')
+        separators = _parse_separators(kwargs['separators'])
 
-        # check dependencies
-        if 'sentence' in separators:
-            Env.get().require_package('spacy')
-        if 'token_limit' in separators:
-            Env.get().require_package('tiktoken')
+        limit = kwargs.get('limit')
+        overlap = kwargs.get('overlap')
 
-        if 'limit' in kwargs or 'overlap' in kwargs:
-            if 'token_limit' not in separators and 'char_limit' not in separators:
+        if limit is not None or overlap is not None:
+            if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
                 raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
-            if 'limit' in kwargs and int(kwargs['limit']) <= 0:
+            if limit is not None and limit <= 0:
                 raise Error('"limit" must be an integer > 0')
-            if 'overlap' in kwargs and int(kwargs['overlap']) < 0:
+            if overlap is not None and overlap < 0:
                 raise Error('"overlap" must be an integer >= 0')
-        if 'token_limit' in separators or 'char_limit' in separators:
-            if 'token_limit' in separators and 'char_limit' in separators:
+        if Separator.TOKEN_LIMIT in separators or Separator.CHAR_LIMIT in separators:
+            if Separator.TOKEN_LIMIT in separators and Separator.CHAR_LIMIT in separators:
                 raise Error('Cannot specify both "token_limit" and "char_limit" separators')
-            if 'limit' not in kwargs:
+            if kwargs.get('limit') is None:
                 raise Error('limit is required with "token_limit"/"char_limit" separators')
 
+        # check dependencies at the end
+        if Separator.SENTENCE in separators:
+            Env.get().require_package('spacy')
+        if Separator.TOKEN_LIMIT in separators:
+            Env.get().require_package('tiktoken')
+
         return schema, []
 
     def __next__(self) -> Dict[str, Any]:
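Note: the reordered validation is easiest to read as concrete calls. A sketch of which argument combinations pass and which raise `Error`, based only on the checks in this hunk (running it needs a working pixeltable environment, since the dependency checks go through `Env.get()`):

    from pixeltable.exceptions import Error
    from pixeltable.iterators.document import DocumentSplitter

    # valid: token_limit plus a positive limit; the schema gains the requested metadata columns
    schema, _ = DocumentSplitter.output_schema(separators='heading,token_limit', metadata='title,page', limit=300)
    print(sorted(schema))  # ['page', 'text', 'title']

    for bad in (
        dict(separators='heading', limit=300),                 # limit without token_limit/char_limit
        dict(separators='token_limit,char_limit', limit=300),  # both limit separators at once
        dict(separators='token_limit'),                        # token_limit without a limit
        dict(separators='sentance'),                           # typo -> invalid separator error
    ):
        try:
            DocumentSplitter.output_schema(**bad)
        except Error as e:
            print(e)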
@@ -145,47 +203,55 @@ class DocumentSplitter(ComponentIterator):
             if section.text is None:
                 continue
             result = {'text': section.text}
-            for md_field in self._md_fields:
+            for md_field in self._metadata_fields:
                 if md_field == ChunkMetadata.TITLE:
                     result[md_field.name.lower()] = self._doc_title
-                elif md_field == ChunkMetadata.HEADINGS:
-                    result[md_field.name.lower()] = section.md.headings
+                elif md_field == ChunkMetadata.HEADING:
+                    result[md_field.name.lower()] = section.metadata.heading
                 elif md_field == ChunkMetadata.SOURCELINE:
-                    result[md_field.name.lower()] = section.md.source_line
+                    result[md_field.name.lower()] = section.metadata.sourceline
+                elif md_field == ChunkMetadata.PAGE:
+                    result[md_field.name.lower()] = section.metadata.page
+                elif md_field == ChunkMetadata.BOUNDING_BOX:
+                    result[md_field.name.lower()] = section.metadata.bounding_box
             return result
 
-    def _html_sections(self) -> Generator[DocumentSection, None, None]:
+    def _html_sections(self) -> Iterator[DocumentSection]:
         """Create DocumentSections reflecting the html-specific separators"""
         import bs4
         emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
         emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
         # current state
-        text_section = ''  # currently accumulated text
+        accumulated_text = []  # currently accumulated text
+        # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
+
         headings: Dict[int, str] = {}  # current state of observed headings (level -> text)
         sourceline = 0  # most recently seen sourceline
 
-        def update_md(el: bs4.Tag) -> None:
+        def update_metadata(el: bs4.Tag) -> None:
             # update current state
             nonlocal headings, sourceline
             sourceline = el.sourceline
-            if el.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+            if el.name in _HTML_HEADINGS:
                 level = int(el.name[1])
                 # remove the previously seen lower levels
-                lower_levels = [l for l in headings.keys() if l > level]
+                lower_levels = [l for l in headings if l > level]
                 for l in lower_levels:
                     del headings[l]
                 headings[level] = el.get_text().strip()
 
         def emit() -> None:
-            nonlocal text_section, headings, sourceline
-            if len(text_section) > 0:
-                md = DocumentSectionMd(sourceline, headings.copy())
-                yield DocumentSection(text=text_section, md=md)
-                text_section = ''
-
-        def process_element(el: bs4.PageElement) -> Generator[DocumentSection, None, None]:
+            nonlocal accumulated_text, headings, sourceline
+            if len(accumulated_text) > 0:
+                md = DocumentSectionMetadata(sourceline=sourceline, heading=headings.copy())
+                full_text = ' '.join(accumulated_text)
+                full_text = ftfy.fix_text(full_text)
+                yield DocumentSection(text=full_text, metadata=md)
+                accumulated_text = []
+
+        def process_element(el: bs4.PageElement) -> Iterator[DocumentSection]:
             # process the element and emit sections as necessary
-            nonlocal text_section, headings, sourceline, emit_on_heading, emit_on_paragraph
+            nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
             if el.name in self._skip_tags:
                 return
 
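Note: replacing `text_section += ' ' + text` with a list of pieces joined at emit time is the standard fix for quadratic string building; every `+=` on a string copies the entire accumulated prefix. The pattern in isolation (illustration only, not pixeltable code):

    pieces = []
    for fragment in ('First sentence.', 'Second sentence.', 'Third.'):
        pieces.append(fragment)   # amortized O(1) per append
    chunk = ' '.join(pieces)      # single O(n) pass at emit time
    print(chunk)                  # First sentence. Second sentence. Third.
    pieces.clear()                # reset for the next section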
@@ -193,30 +259,31 @@ class DocumentSplitter(ComponentIterator):
                 # accumulate text until we see a tag we care about
                 text = el.get_text().strip()
                 if len(text) > 0:
-                    text_section += ' ' + text
+                    accumulated_text.append(text)
                 return
 
-            if el.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+            if el.name in _HTML_HEADINGS:
                 if emit_on_heading:
                     yield from emit()
-                update_md(el)
+                update_metadata(el)
             elif el.name == 'p':
                 if emit_on_paragraph:
                     yield from emit()
-                update_md(el)
+                update_metadata(el)
             for child in el.children:
                 yield from process_element(child)
 
         yield from process_element(self._doc_handle.bs_doc)
         yield from emit()
 
-    def _markdown_sections(self) -> Generator[DocumentSection, None, None]:
+    def _markdown_sections(self) -> Iterator[DocumentSection]:
         """Create DocumentSections reflecting the html-specific separators"""
         assert self._doc_handle.md_ast is not None
         emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
         emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
         # current state
-        text_section = ''  # currently accumulated text
+        accumulated_text = []  # currently accumulated text
+        # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
         headings: Dict[int, str] = {}  # current state of observed headings (level -> text)
 
         def update_headings(heading: Dict) -> None:
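Note: the `headings` dict acts as a stack of the headings currently in scope; seeing a heading at level n discards anything recorded at deeper levels. A self-contained replica of the logic shared by `update_metadata` and `update_headings`, showing how the stack evolves:

    from typing import Dict

    def observe_heading(headings: Dict[int, str], level: int, text: str) -> None:
        for l in [l for l in headings if l > level]:  # drop deeper levels
            del headings[l]
        headings[level] = text

    headings: Dict[int, str] = {}
    observe_heading(headings, 1, 'Intro')       # {1: 'Intro'}
    observe_heading(headings, 2, 'Background')  # {1: 'Intro', 2: 'Background'}
    observe_heading(headings, 3, 'Details')     # adds level 3
    observe_heading(headings, 2, 'Methods')     # level 3 dropped
    print(headings)                             # {1: 'Intro', 2: 'Methods'}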
@@ -232,22 +299,22 @@ class DocumentSplitter(ComponentIterator):
             headings[level] = text
 
         def emit() -> None:
-            nonlocal text_section, headings
-            if len(text_section) > 0:
-                md = DocumentSectionMd(0, headings.copy())
-                yield DocumentSection(text=text_section, md=md)
-                text_section = ''
+            nonlocal accumulated_text, headings
+            if len(accumulated_text) > 0:
+                metadata = DocumentSectionMetadata(sourceline=0, heading=headings.copy())
+                yield DocumentSection(text=ftfy.fix_text(' '.join(accumulated_text)), metadata=metadata)
+                accumulated_text = []
 
-        def process_element(el: Dict) -> Generator[DocumentSection, None, None]:
+        def process_element(el: Dict) -> Iterator[DocumentSection]:
             # process the element and emit sections as necessary
-            nonlocal text_section, headings, emit_on_heading, emit_on_paragraph
+            nonlocal accumulated_text, headings, emit_on_heading, emit_on_paragraph
             assert 'type' in el
 
             if el['type'] == 'text':
                 # accumulate text until we see a separator element
                 text = el['raw'].strip()
                 if len(text) > 0:
-                    text_section += ' ' + text
+                    accumulated_text.append(text)
                 return
 
             if el['type'] == 'heading':
@@ -266,15 +333,57 @@ class DocumentSplitter(ComponentIterator):
                 yield from process_element(el)
         yield from emit()
 
-    def _sentence_sections(self, input_sections: Iterable[DocumentSection]) -> Generator[DocumentSection, None, None]:
+    def _pdf_sections(self) -> Iterator[DocumentSection]:
+        """Create DocumentSections reflecting the pdf-specific separators"""
+        import fitz
+        doc: fitz.Document = self._doc_handle.pdf_doc
+        assert doc is not None
+
+        emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
+        emit_on_page = Separator.PAGE in self._separators or emit_on_paragraph
+
+        accumulated_text = []  # invariant: all elements are ftfy clean and non-empty
+
+        def _add_cleaned_text(raw_text: str) -> None:
+            fixed = ftfy.fix_text(raw_text)
+            if fixed:
+                accumulated_text.append(fixed)
+
+        def _emit_text() -> str:
+            full_text = ''.join(accumulated_text)
+            accumulated_text.clear()
+            return full_text
+
+        for page_number, page in enumerate(doc.pages()):
+            for block in page.get_text('blocks'):
+                # there is no concept of paragraph in pdf, block is the closest thing
+                # we can get (eg a paragraph in text may cut across pages)
+                # see pymupdf docs https://pymupdf.readthedocs.io/en/latest/app1.html
+                # other libraries like pdfminer also lack an explicit paragraph concept
+                x1, y1, x2, y2, text, _, _ = block
+                _add_cleaned_text(text)
+                if accumulated_text and emit_on_paragraph:
+                    bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
+                    metadata = DocumentSectionMetadata(page=page_number, bounding_box=bbox)
+                    yield DocumentSection(text=_emit_text(), metadata=metadata)
+
+            if accumulated_text and emit_on_page and not emit_on_paragraph:
+                yield DocumentSection(text=_emit_text(),
+                                      metadata=DocumentSectionMetadata(page=page_number))
+                accumulated_text = []
+
+        if accumulated_text and not emit_on_page:
+            yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
+
+    def _sentence_sections(self, input_sections: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
         """Split the input sections into sentences"""
         for section in input_sections:
             if section.text is not None:
                 doc = Env.get().spacy_nlp(section.text)
                 for sent in doc.sents:
-                    yield DocumentSection(text=sent.text, md=section.md)
+                    yield DocumentSection(text=sent.text, metadata=section.metadata)
 
-    def _token_chunks(self, input: Iterable[DocumentSection]) -> Generator[DocumentSection, None, None]:
+    def _token_chunks(self, input: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
         import tiktoken
         if self._tiktoken_target_model is not None:
             encoding = tiktoken.encoding_for_model(self._tiktoken_target_model)
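Note: `_pdf_sections` builds on PyMuPDF's block extraction, where each block is a 7-tuple `(x0, y0, x1, y1, text, block_no, block_type)`. A standalone sketch of the same traversal, assuming a hypothetical local `sample.pdf`:

    import fitz  # PyMuPDF

    doc = fitz.open('sample.pdf')  # hypothetical input file
    for page_number, page in enumerate(doc.pages()):
        for block in page.get_text('blocks'):
            x1, y1, x2, y2, text, _block_no, _block_type = block
            bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
            print(page_number, bbox, text.strip()[:40])
    doc.close()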
@@ -287,13 +396,25 @@ class DocumentSplitter(ComponentIterator):
                 continue
             tokens = encoding.encode(section.text)
             start_idx = 0
+            text = None
             while start_idx < len(tokens):
                 end_idx = min(start_idx + self._limit, len(tokens))
-                text = encoding.decode(tokens[start_idx:end_idx])
-                yield DocumentSection(text=text, md=section.md)
-                start_idx += self._limit - self._overlap
-
-    def _char_chunks(self, input: Iterable[DocumentSection]) -> Generator[DocumentSection, None, None]:
+                while end_idx > start_idx:
+                    # find a cutoff point that doesn't cut in the middle of utf8 multi-byte sequences
+                    try:
+                        # check that the truncated data can be properly decoded
+                        text = encoding.decode(tokens[start_idx:end_idx], errors='strict')
+                        break
+                    except UnicodeDecodeError:
+                        # we split the token array at a point where the utf8 encoding is broken
+                        end_idx -= 1
+
+                assert end_idx > start_idx
+                assert text
+                yield DocumentSection(text=text, metadata=section.metadata)
+                start_idx = max(start_idx + 1, end_idx - self._overlap)  # ensure we make progress
+
+    def _char_chunks(self, input: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
         for section in input:
             if section.text is None:
                 continue
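Note: the new stride `start_idx = max(start_idx + 1, end_idx - self._overlap)` replaces the old fixed `start_idx += self._limit - self._overlap` and guarantees forward progress even when a window shrinks (because `end_idx` backed off over a broken UTF-8 boundary, or because `overlap >= limit`). The index arithmetic in isolation, with integers standing in for tokens:

    def windows(n_tokens: int, limit: int, overlap: int):
        # same index arithmetic as _token_chunks, minus the utf8 back-off
        start_idx = 0
        while start_idx < n_tokens:
            end_idx = min(start_idx + limit, n_tokens)
            yield (start_idx, end_idx)
            start_idx = max(start_idx + 1, end_idx - overlap)  # ensure we make progress

    print(list(windows(10, 4, 1)))  # [(0, 4), (3, 7), (6, 10), (9, 10)]
    print(list(windows(6, 2, 2)))   # overlap >= limit still terminates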
@@ -301,7 +422,7 @@ class DocumentSplitter(ComponentIterator):
             while start_idx < len(section.text):
                 end_idx = min(start_idx + self._limit, len(section.text))
                 text = section.text[start_idx:end_idx]
-                yield DocumentSection(text=text, md=section.md)
+                yield DocumentSection(text=text, metadata=section.metadata)
                 start_idx += self._limit - self._overlap
 
     def close(self) -> None:
--- a/pixeltable/iterators/video.py
+++ b/pixeltable/iterators/video.py
@@ -1,21 +1,20 @@
-from typing import Dict, Any, List, Tuple
-from pathlib import Path
-import math
 import logging
+import math
+from pathlib import Path
+from typing import Dict, Any, List, Tuple
 
-import cv2
 import PIL.Image
+import cv2
 
-from .base import ComponentIterator
-
-from pixeltable.type_system import ColumnType, VideoType, ImageType, IntType, FloatType
+from pixeltable import exprs
 from pixeltable.exceptions import Error
-
+from pixeltable.type_system import ColumnType, VideoType, ImageType, IntType, FloatType
+from .base import ComponentIterator
 
 _logger = logging.getLogger('pixeltable')
 
 class FrameIterator(ComponentIterator):
-    def __init__(self, video: str, fps: float = 0.0):
+    def __init__(self, video: str, *, fps: float = 0.0):
         video_path = Path(video)
         assert video_path.exists() and video_path.is_file()
         self.video_path = video_path
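Note: the `*` added to `FrameIterator.__init__` makes `fps` keyword-only. A sketch, assuming a hypothetical local `clip.mp4`:

    from pixeltable.iterators.video import FrameIterator

    it = FrameIterator('clip.mp4', fps=1.0)  # ok: fps passed by keyword
    # FrameIterator('clip.mp4', 1.0)         # TypeError under the new signature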
--- a/pixeltable/metadata/__init__.py
+++ b/pixeltable/metadata/__init__.py
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
 from .schema import SystemInfo, SystemInfoMd
 
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 12
+VERSION = 14
 
 
 def create_system_info(engine: sql.engine.Engine) -> None:
@@ -30,17 +30,21 @@ def register_converter(version: int, cb: Callable[[sql.engine.Engine], None]) ->
     global converter_cbs
     converter_cbs[version] = cb
 
+def noop_converter(engine: sql.engine.Engine) -> None:
+    # Converter to use when incrementing the schema version, but without any functional changes
+    pass
+
 # load all converter modules
 for _, modname, _ in pkgutil.iter_modules([os.path.dirname(__file__) + '/converters']):
     importlib.import_module('pixeltable.metadata.converters.' + modname)
 
 def upgrade_md(engine: sql.engine.Engine) -> None:
     """Upgrade the metadata schema to the current version"""
-    with orm.Session(engine, future=True) as session:
+    with orm.Session(engine) as session:
         system_info = session.query(SystemInfo).one().md
         md_version = system_info['schema_version']
         if md_version == VERSION:
-            return
+            return
         while md_version < VERSION:
             if md_version not in converter_cbs:
                 raise RuntimeError(f'No metadata converter for version {md_version}')
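Note: with `VERSION = 14`, a database at schema version 12 is brought current by chaining the registered converters: `noop_converter` for 12 -> 13 (registered in the new convert_12.py below) and `convert_13` for 13 -> 14. A sketch of the dispatch loop's effect, using stand-ins rather than a live SQLAlchemy engine:

    VERSION = 14
    converter_cbs = {
        12: lambda engine: None,  # noop_converter: version bump only
        13: lambda engine: None,  # convert_13: rewrites serialized classpaths
    }

    md_version = 12
    while md_version < VERSION:
        if md_version not in converter_cbs:
            raise RuntimeError(f'No metadata converter for version {md_version}')
        converter_cbs[md_version](None)  # stand-in for the engine argument
        md_version += 1
    print(md_version)  # 14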
--- /dev/null
+++ b/pixeltable/metadata/converters/convert_12.py
@@ -0,0 +1,3 @@
+from pixeltable.metadata import register_converter, noop_converter
+
+register_converter(12, noop_converter)
--- /dev/null
+++ b/pixeltable/metadata/converters/convert_13.py
@@ -0,0 +1,41 @@
+import logging
+from typing import Any
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.schema import Table
+
+_logger = logging.getLogger('pixeltable')
+
+
+def convert_13(engine: sql.engine.Engine) -> None:
+    with engine.begin() as conn:
+        for row in conn.execute(sql.select(Table)):
+            id = row[0]
+            md = row[2]
+            updated_md = _update_md(md)
+            if updated_md != md:
+                _logger.info(f'Updating schema for table: {id}')
+                conn.execute(sql.update(Table).where(Table.id == id).values(md=updated_md))
+
+
+# Traverse the schema dictionary and replace instances of `ExplicitBatchedFunction` with
+# `CallableFunction`. DB versions prior to 14 can't contain serialized batched functions,
+# so this is all we need to do.
+def _update_md(md: Any) -> Any:
+    if isinstance(md, dict):
+        updated_md = {}
+        for k, v in md.items():
+            if k == '_classpath' and v == 'pixeltable.func.batched_function.ExplicitBatchedFunction':
+                updated_md[k] = 'pixeltable.func.callable_function.CallableFunction'
+            else:
+                updated_md[k] = _update_md(v)
+        return updated_md
+    elif isinstance(md, list):
+        return [_update_md(v) for v in md]
+    else:
+        return md
+
+
+register_converter(13, convert_13)
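Note: `_update_md` recurses through nested dicts and lists, rewriting only `_classpath` entries that point at the removed `ExplicitBatchedFunction`. Its effect on a representative fragment (field names like 'value_expr' are hypothetical):

    from typing import Any

    OLD = 'pixeltable.func.batched_function.ExplicitBatchedFunction'
    NEW = 'pixeltable.func.callable_function.CallableFunction'

    def update_md(md: Any) -> Any:
        # same recursion as _update_md above
        if isinstance(md, dict):
            return {k: NEW if k == '_classpath' and v == OLD else update_md(v) for k, v in md.items()}
        if isinstance(md, list):
            return [update_md(v) for v in md]
        return md

    md = {'columns': [{'value_expr': {'_classpath': OLD, 'name': 'f'}}]}
    print(update_md(md)['columns'][0]['value_expr']['_classpath'])
    # -> pixeltable.func.callable_function.CallableFunction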