PyPI - pixeltable - Versions diffs - 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl - Mend

pixeltable 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic. Click here for more details.

Files changed (99) hide show

pixeltable/__init__.py +18 -9
pixeltable/__version__.py +3 -0
pixeltable/catalog/column.py +31 -50
pixeltable/catalog/insertable_table.py +7 -6
pixeltable/catalog/table.py +171 -57
pixeltable/catalog/table_version.py +417 -140
pixeltable/catalog/table_version_path.py +2 -2
pixeltable/dataframe.py +239 -121
pixeltable/env.py +82 -16
pixeltable/exec/__init__.py +2 -1
pixeltable/exec/cache_prefetch_node.py +1 -1
pixeltable/exec/data_row_batch.py +6 -7
pixeltable/exec/expr_eval_node.py +28 -28
pixeltable/exec/in_memory_data_node.py +11 -7
pixeltable/exec/sql_scan_node.py +7 -6
pixeltable/exprs/__init__.py +4 -3
pixeltable/exprs/column_ref.py +9 -0
pixeltable/exprs/comparison.py +3 -3
pixeltable/exprs/data_row.py +5 -1
pixeltable/exprs/expr.py +15 -7
pixeltable/exprs/function_call.py +17 -15
pixeltable/exprs/image_member_access.py +9 -28
pixeltable/exprs/in_predicate.py +96 -0
pixeltable/exprs/inline_array.py +13 -11
pixeltable/exprs/inline_dict.py +15 -13
pixeltable/exprs/literal.py +16 -4
pixeltable/exprs/row_builder.py +15 -41
pixeltable/exprs/similarity_expr.py +65 -0
pixeltable/ext/__init__.py +5 -0
pixeltable/ext/functions/yolox.py +92 -0
pixeltable/func/__init__.py +0 -2
pixeltable/func/aggregate_function.py +18 -15
pixeltable/func/callable_function.py +57 -13
pixeltable/func/expr_template_function.py +20 -3
pixeltable/func/function.py +35 -4
pixeltable/func/globals.py +24 -14
pixeltable/func/signature.py +23 -27
pixeltable/func/udf.py +13 -12
pixeltable/functions/__init__.py +8 -8
pixeltable/functions/eval.py +7 -8
pixeltable/functions/huggingface.py +64 -17
pixeltable/functions/openai.py +36 -3
pixeltable/functions/pil/image.py +61 -64
pixeltable/functions/together.py +21 -0
pixeltable/functions/util.py +11 -0
pixeltable/globals.py +425 -0
pixeltable/index/__init__.py +2 -0
pixeltable/index/base.py +51 -0
pixeltable/index/embedding_index.py +168 -0
pixeltable/io/__init__.py +3 -0
pixeltable/{utils → io}/hf_datasets.py +48 -17
pixeltable/io/pandas.py +148 -0
pixeltable/{utils → io}/parquet.py +58 -33
pixeltable/iterators/__init__.py +1 -1
pixeltable/iterators/base.py +4 -0
pixeltable/iterators/document.py +218 -97
pixeltable/iterators/video.py +8 -9
pixeltable/metadata/__init__.py +7 -3
pixeltable/metadata/converters/convert_12.py +3 -0
pixeltable/metadata/converters/convert_13.py +41 -0
pixeltable/metadata/schema.py +45 -22
pixeltable/plan.py +15 -51
pixeltable/store.py +38 -41
pixeltable/tool/create_test_db_dump.py +39 -4
pixeltable/type_system.py +47 -96
pixeltable/utils/documents.py +42 -12
pixeltable/utils/http_server.py +70 -0
{pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/METADATA +14 -10
pixeltable-0.2.6.dist-info/RECORD +119 -0
{pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/WHEEL +1 -1
pixeltable/client.py +0 -604
pixeltable/exprs/image_similarity_predicate.py +0 -58
pixeltable/func/batched_function.py +0 -53
pixeltable/tests/conftest.py +0 -177
pixeltable/tests/functions/test_fireworks.py +0 -42
pixeltable/tests/functions/test_functions.py +0 -60
pixeltable/tests/functions/test_huggingface.py +0 -158
pixeltable/tests/functions/test_openai.py +0 -152
pixeltable/tests/functions/test_together.py +0 -111
pixeltable/tests/test_audio.py +0 -65
pixeltable/tests/test_catalog.py +0 -27
pixeltable/tests/test_client.py +0 -21
pixeltable/tests/test_component_view.py +0 -370
pixeltable/tests/test_dataframe.py +0 -439
pixeltable/tests/test_dirs.py +0 -107
pixeltable/tests/test_document.py +0 -120
pixeltable/tests/test_exprs.py +0 -805
pixeltable/tests/test_function.py +0 -324
pixeltable/tests/test_migration.py +0 -43
pixeltable/tests/test_nos.py +0 -54
pixeltable/tests/test_snapshot.py +0 -208
pixeltable/tests/test_table.py +0 -1267
pixeltable/tests/test_transactional_directory.py +0 -42
pixeltable/tests/test_types.py +0 -22
pixeltable/tests/test_video.py +0 -159
pixeltable/tests/test_view.py +0 -530
pixeltable/tests/utils.py +0 -408
pixeltable-0.2.4.dist-info/RECORD +0 -132
{pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/LICENSE +0 -0

pixeltable/iterators/document.py CHANGED Viewed

@@ -1,24 +1,24 @@
-from typing import Dict, Any, List, Tuple, Generator, Optional, Iterable
-import logging
 import dataclasses
 import enum
+import logging
+from typing import Dict, Any, List, Tuple, Optional, Iterable, Iterator
-from .base import ComponentIterator
+import ftfy
-from pixeltable.type_system import ColumnType, DocumentType, StringType, IntType, JsonType
-from pixeltable.exceptions import Error
 from pixeltable.env import Env
+from pixeltable.exceptions import Error
+from pixeltable.type_system import ColumnType, DocumentType, StringType, IntType, JsonType
 from pixeltable.utils.documents import get_document_handle
+from .base import ComponentIterator
 _logger = logging.getLogger('pixeltable')
 class ChunkMetadata(enum.Enum):
     TITLE = 1
-    HEADINGS = 2
+    HEADING = 2
     SOURCELINE = 3
+    PAGE = 4
+    BOUNDING_BOX = 5
 class Separator(enum.Enum):
     HEADING = 1
@@ -26,52 +26,106 @@ class Separator(enum.Enum):
     SENTENCE = 3
     TOKEN_LIMIT = 4
     CHAR_LIMIT = 5
+    PAGE = 6
 @dataclasses.dataclass
-class DocumentSectionMd:
+class DocumentSectionMetadata:
     """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
-    source_line: int
+    # html and markdown metadata
+    sourceline: Optional[int] = None
     # the stack of headings up to the most recently observed one;
     # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
-    headings: Dict[int, str]
+    heading: Optional[Dict[int, str]] = None
+    # pdf-specific metadata
+    page: Optional[int] = None
+    # bounding box as an {x1, y1, x2, y2} dictionary
+    bounding_box: Optional[Dict[str, float]] = None
 @dataclasses.dataclass
 class DocumentSection:
     """A single document chunk, according to some of the splitting criteria"""
     text: Optional[str]
-    md: Optional[DocumentSectionMd]
+    metadata: Optional[DocumentSectionMetadata]
+def _parse_separators(separators: str) -> List[Separator]:
+    ret = []
+    for s in separators.split(','):
+        clean_s = s.strip().upper()
+        if not clean_s:
+            continue
+        if clean_s not in Separator.__members__:
+            raise Error(
+                f'Invalid separator: `{s.strip()}`. Valid separators are: {", ".join(Separator.__members__).lower()}'
+            )
+        ret.append(Separator[clean_s])
+    return ret
+def _parse_metadata(metadata: str) -> List[ChunkMetadata]:
+    ret = []
+    for m in metadata.split(','):
+        clean_m = m.strip().upper()
+        if not clean_m:
+            continue
+        if clean_m not in ChunkMetadata.__members__:
+            raise Error(
+                f'Invalid metadata: `{m.strip()}`. Valid metadata are: {", ".join(ChunkMetadata.__members__).lower()}'
+            )
+        ret.append(ChunkMetadata[clean_m])
+    return ret
+_HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
 class DocumentSplitter(ComponentIterator):
-    """"Iterator over pieces of a document"""
-    MD_COLUMN_TYPES = {
-        ChunkMetadata.TITLE: StringType(),
-        ChunkMetadata.HEADINGS: JsonType(),
-        ChunkMetadata.SOURCELINE: IntType()
+    """Iterator over pieces of a document. The document is split into chunks based on the specified separators.
+    The iterator output tuples are of schema {'text': StringType()}, but can include additional metadata fields if specified
+    in the `metadata` argument as explained below.
+    All chunk text is passed through `ftfy.fix_text` to fix up common problems with unicode sequences.
+    Args:
+        `metadata`: which additional metadata fields to include in the output schema:
+             'title', 'heading' (HTML and Markdown), 'sourceline' (HTML), 'page' (PDF), 'bounding_box' (PDF).
+             The input can be a comma-separated string of these values eg. 'title,heading,sourceline'.
+        `separators`: which separators to use to split the document into rows. Options are:
+             'heading', 'paragraph', 'sentence', 'token_limit', 'char_limit', 'page'. As with metadata, this can be a
+                comma-separated string eg. 'heading, token_limit'.
+        `limit`: the maximum number of tokens or characters in each chunk if 'token_limit' or 'char_limit' is specified.
+    """
+    METADATA_COLUMN_TYPES = {
+        ChunkMetadata.TITLE: StringType(nullable=True),
+        ChunkMetadata.HEADING: JsonType(nullable=True),
+        ChunkMetadata.SOURCELINE: IntType(nullable=True),
+        ChunkMetadata.PAGE: IntType(nullable=True),
+        ChunkMetadata.BOUNDING_BOX: JsonType(nullable=True),
     }
     def __init__(
-            self, document: str, *, separators: str, limit: int = 0, overlap: int = 0, metadata: str = '',
-            html_skip_tags: List[str] = None, tiktoken_encoding: Optional[str] = 'cl100k_base',
+            self, document: str, *, separators: str, limit: Optional[int] = None, overlap: Optional[int] = None, metadata: str = '',
+            html_skip_tags: Optional[List[str]] = None, tiktoken_encoding: Optional[str] = 'cl100k_base',
             tiktoken_target_model: Optional[str] = None
     ):
-        import bs4
         if html_skip_tags is None:
             html_skip_tags = ['nav']
-        with open(document, 'r', encoding='utf8') as fh:
-            s = fh.read()
-            self._doc_handle = get_document_handle(s)
-            assert self._doc_handle is not None
-        self._separators = [Separator[s.upper()] for s in separators.split(',')]
-        self._md_fields = [ChunkMetadata[m.upper()] for m in metadata.split(',')] if len(metadata) > 0 else []
-        self._doc_title = \
-            self._doc_handle.bs_doc.title.get_text().strip() if self._doc_handle.bs_doc is not None else ''
-        self._limit = limit
+        self._doc_handle = get_document_handle(document)
+        assert self._doc_handle is not None
+        # calling the output_schema method to validate the input arguments
+        self.output_schema(separators=separators, metadata=metadata, limit=limit, overlap=overlap)
+        self._separators = _parse_separators(separators)
+        self._metadata_fields = _parse_metadata(metadata)
+        if self._doc_handle.bs_doc is not None:
+            title = self._doc_handle.bs_doc.title
+            if title is None:
+                self._doc_title = ''
+            else:
+                self._doc_title = ftfy.fix_text(title.get_text().strip())
+        else:
+            self._doc_title = ''
+        self._limit = 0 if limit is None else limit
         self._skip_tags = html_skip_tags
-        self._overlap = overlap
+        self._overlap = 0 if overlap is None else overlap
         self._tiktoken_encoding = tiktoken_encoding
         self._tiktoken_target_model = tiktoken_target_model
@@ -79,9 +133,15 @@ class DocumentSplitter(ComponentIterator):
         if self._doc_handle.format == DocumentType.DocumentFormat.HTML:
             assert self._doc_handle.bs_doc is not None
             self._sections = self._html_sections()
-        else:
+        elif self._doc_handle.format == DocumentType.DocumentFormat.MD:
             assert self._doc_handle.md_ast is not None
             self._sections = self._markdown_sections()
+        elif self._doc_handle.format == DocumentType.DocumentFormat.PDF:
+            assert self._doc_handle.pdf_doc is not None
+            self._sections = self._pdf_sections()
+        else:
+            assert False, f'unknown document format: {self._doc_handle.format}'
         if Separator.SENTENCE in self._separators:
             self._sections = self._sentence_sections(self._sections)
         if Separator.TOKEN_LIMIT in self._separators:
@@ -105,38 +165,36 @@ class DocumentSplitter(ComponentIterator):
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
         schema = {'text': StringType()}
-        if 'metadata' in kwargs and len(kwargs['metadata']) > 0:
-            md_fields = kwargs['metadata'].split(',')
-            for md_field in md_fields:
-                if not hasattr(ChunkMetadata, md_field.upper()):
-                    raise Error(f'Invalid metadata field {md_field}')
-                schema[md_field.lower()] = cls.MD_COLUMN_TYPES[ChunkMetadata[md_field.upper()]]
+        md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
+        for md_field in md_fields:
+            schema[md_field.name.lower()] = cls.METADATA_COLUMN_TYPES[md_field]
         assert 'separators' in kwargs
-        separators = kwargs['separators'].split(',')
-        for separator in separators:
-            if not hasattr(Separator, separator.upper()):
-                raise Error(f'Invalid separator {separator}')
+        separators = _parse_separators(kwargs['separators'])
-        # check dependencies
-        if 'sentence' in separators:
-            Env.get().require_package('spacy')
-        if 'token_limit' in separators:
-            Env.get().require_package('tiktoken')
+        limit = kwargs.get('limit')
+        overlap = kwargs.get('overlap')
-        if 'limit' in kwargs or 'overlap' in kwargs:
-            if 'token_limit' not in separators and 'char_limit' not in separators:
+        if limit is not None or overlap is not None:
+            if Separator.TOKEN_LIMIT not in separators and Separator.CHAR_LIMIT not in separators:
                 raise Error('limit/overlap requires the "token_limit" or "char_limit" separator')
-            if 'limit' in kwargs and int(kwargs['limit']) <= 0:
+            if limit is not None and limit <= 0:
                 raise Error('"limit" must be an integer > 0')
-            if 'overlap' in kwargs and int(kwargs['overlap']) < 0:
+            if overlap is not None and overlap < 0:
                 raise Error('"overlap" must be an integer >= 0')
-        if 'token_limit' in separators or 'char_limit' in separators:
-            if 'token_limit' in separators and 'char_limit' in separators:
+        if Separator.TOKEN_LIMIT in separators or Separator.CHAR_LIMIT in separators:
+            if Separator.TOKEN_LIMIT in separators and Separator.CHAR_LIMIT in separators:
                 raise Error('Cannot specify both "token_limit" and "char_limit" separators')
-            if 'limit' not in kwargs:
+            if kwargs.get('limit') is None:
                 raise Error('limit is required with "token_limit"/"char_limit" separators')
+        # check dependencies at the end
+        if Separator.SENTENCE in separators:
+            Env.get().require_package('spacy')
+        if Separator.TOKEN_LIMIT in separators:
+            Env.get().require_package('tiktoken')
         return schema, []
     def __next__(self) -> Dict[str, Any]:
@@ -145,47 +203,55 @@ class DocumentSplitter(ComponentIterator):
             if section.text is None:
                 continue
             result = {'text': section.text}
-            for md_field in self._md_fields:
+            for md_field in self._metadata_fields:
                 if md_field == ChunkMetadata.TITLE:
                     result[md_field.name.lower()] = self._doc_title
-                elif md_field == ChunkMetadata.HEADINGS:
-                    result[md_field.name.lower()] = section.md.headings
+                elif md_field == ChunkMetadata.HEADING:
+                    result[md_field.name.lower()] = section.metadata.heading
                 elif md_field == ChunkMetadata.SOURCELINE:
-                    result[md_field.name.lower()] = section.md.source_line
+                    result[md_field.name.lower()] = section.metadata.sourceline
+                elif md_field == ChunkMetadata.PAGE:
+                    result[md_field.name.lower()] = section.metadata.page
+                elif md_field == ChunkMetadata.BOUNDING_BOX:
+                    result[md_field.name.lower()] = section.metadata.bounding_box
             return result
-    def _html_sections(self) -> Generator[DocumentSection, None, None]:
+    def _html_sections(self) -> Iterator[DocumentSection]:
         """Create DocumentSections reflecting the html-specific separators"""
         import bs4
         emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
         emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
         # current state
-        text_section = ''  # currently accumulated text
+        accumulated_text = []  # currently accumulated text
+        # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
         headings: Dict[int, str] = {}   # current state of observed headings (level -> text)
         sourceline = 0  # most recently seen sourceline
-        def update_md(el: bs4.Tag) -> None:
+        def update_metadata(el: bs4.Tag) -> None:
             # update current state
             nonlocal headings, sourceline
             sourceline = el.sourceline
-            if el.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+            if el.name in _HTML_HEADINGS:
                 level = int(el.name[1])
                 # remove the previously seen lower levels
-                lower_levels = [l for l in headings.keys() if l > level]
+                lower_levels = [l for l in headings if l > level]
                 for l in lower_levels:
                     del headings[l]
                 headings[level] = el.get_text().strip()
         def emit() -> None:
-            nonlocal text_section, headings, sourceline
-            if len(text_section) > 0:
-                md = DocumentSectionMd(sourceline, headings.copy())
-                yield DocumentSection(text=text_section, md=md)
-                text_section = ''
-        def process_element(el: bs4.PageElement) -> Generator[DocumentSection, None, None]:
+            nonlocal accumulated_text, headings, sourceline
+            if len(accumulated_text) > 0:
+                md = DocumentSectionMetadata(sourceline=sourceline, heading=headings.copy())
+                full_text = ' '.join(accumulated_text)
+                full_text = ftfy.fix_text(full_text)
+                yield DocumentSection(text=full_text, metadata=md)
+                accumulated_text = []
+        def process_element(el: bs4.PageElement) -> Iterator[DocumentSection]:
             # process the element and emit sections as necessary
-            nonlocal text_section, headings, sourceline, emit_on_heading, emit_on_paragraph
+            nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
             if el.name in self._skip_tags:
                 return
@@ -193,30 +259,31 @@ class DocumentSplitter(ComponentIterator):
                 # accumulate text until we see a tag we care about
                 text = el.get_text().strip()
                 if len(text) > 0:
-                    text_section += ' ' + text
+                    accumulated_text.append(text)
                 return
-            if el.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+            if el.name in _HTML_HEADINGS:
                 if emit_on_heading:
                     yield from emit()
-                update_md(el)
+                update_metadata(el)
             elif el.name == 'p':
                 if emit_on_paragraph:
                     yield from emit()
-                update_md(el)
+                update_metadata(el)
             for child in el.children:
                 yield from process_element(child)
         yield from process_element(self._doc_handle.bs_doc)
         yield from emit()
-    def _markdown_sections(self) -> Generator[DocumentSection, None, None]:
+    def _markdown_sections(self) -> Iterator[DocumentSection]:
         """Create DocumentSections reflecting the markdown-specific separators"""
         assert self._doc_handle.md_ast is not None
         emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
         emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
         # current state
-        text_section = ''  # currently accumulated text
+        accumulated_text = []  # currently accumulated text
+        # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
         headings: Dict[int, str] = {}   # current state of observed headings (level -> text)
         def update_headings(heading: Dict) -> None:
@@ -232,22 +299,22 @@ class DocumentSplitter(ComponentIterator):
             headings[level] = text
         def emit() -> None:
-            nonlocal text_section, headings
-            if len(text_section) > 0:
-                md = DocumentSectionMd(0, headings.copy())
-                yield DocumentSection(text=text_section, md=md)
-                text_section = ''
+            nonlocal accumulated_text, headings
+            if len(accumulated_text) > 0:
+                metadata = DocumentSectionMetadata(sourceline=0, heading=headings.copy())
+                yield DocumentSection(text=ftfy.fix_text(' '.join(accumulated_text)), metadata=metadata)
+                accumulated_text = []
-        def process_element(el: Dict) -> Generator[DocumentSection, None, None]:
+        def process_element(el: Dict) -> Iterator[DocumentSection]:
             # process the element and emit sections as necessary
-            nonlocal text_section, headings, emit_on_heading, emit_on_paragraph
+            nonlocal accumulated_text, headings, emit_on_heading, emit_on_paragraph
             assert 'type' in el
             if el['type'] == 'text':
                 # accumulate text until we see a separator element
                 text = el['raw'].strip()
                 if len(text) > 0:
-                    text_section += ' ' + text
+                    accumulated_text.append(text)
                 return
             if el['type'] == 'heading':
@@ -266,15 +333,57 @@ class DocumentSplitter(ComponentIterator):
             yield from process_element(el)
         yield from emit()
-    def _sentence_sections(self, input_sections: Iterable[DocumentSection]) -> Generator[DocumentSection, None, None]:
+    def _pdf_sections(self) -> Iterator[DocumentSection]:
+        """Create DocumentSections reflecting the pdf-specific separators"""
+        import fitz
+        doc: fitz.Document = self._doc_handle.pdf_doc
+        assert doc is not None
+        emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
+        emit_on_page = Separator.PAGE in self._separators or emit_on_paragraph
+        accumulated_text = []  # invariant: all elements are ftfy clean and non-empty
+        def _add_cleaned_text(raw_text: str) -> None:
+            fixed = ftfy.fix_text(raw_text)
+            if fixed:
+                accumulated_text.append(fixed)
+        def _emit_text() -> str:
+            full_text = ''.join(accumulated_text)
+            accumulated_text.clear()
+            return full_text
+        for page_number, page in enumerate(doc.pages()):
+            for block in page.get_text('blocks'):
+                # there is no concept of paragraph in pdf, block is the closest thing
+                # we can get (eg a paragraph in text may cut across pages)
+                # see pymupdf docs https://pymupdf.readthedocs.io/en/latest/app1.html
+                # other libraries like pdfminer also lack an explicit paragraph concept
+                x1, y1, x2, y2, text, _, _ = block
+                _add_cleaned_text(text)
+                if accumulated_text and emit_on_paragraph:
+                    bbox = {'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}
+                    metadata = DocumentSectionMetadata(page=page_number, bounding_box=bbox)
+                    yield DocumentSection(text=_emit_text(), metadata=metadata)
+            if accumulated_text and emit_on_page and not emit_on_paragraph:
+                yield DocumentSection(text=_emit_text(),
+                                      metadata=DocumentSectionMetadata(page=page_number))
+                accumulated_text = []
+        if accumulated_text and not emit_on_page:
+            yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
+    def _sentence_sections(self, input_sections: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
         """Split the input sections into sentences"""
         for section in input_sections:
             if section.text is not None:
                 doc = Env.get().spacy_nlp(section.text)
                 for sent in doc.sents:
-                    yield DocumentSection(text=sent.text, md=section.md)
+                    yield DocumentSection(text=sent.text, metadata=section.metadata)
-    def _token_chunks(self, input: Iterable[DocumentSection]) -> Generator[DocumentSection, None, None]:
+    def _token_chunks(self, input: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
         import tiktoken
         if self._tiktoken_target_model is not None:
             encoding = tiktoken.encoding_for_model(self._tiktoken_target_model)
@@ -287,13 +396,25 @@ class DocumentSplitter(ComponentIterator):
                 continue
             tokens = encoding.encode(section.text)
             start_idx = 0
+            text = None
             while start_idx < len(tokens):
                 end_idx = min(start_idx + self._limit, len(tokens))
-                text = encoding.decode(tokens[start_idx:end_idx])
-                yield DocumentSection(text=text, md=section.md)
-                start_idx += self._limit - self._overlap
-    def _char_chunks(self, input: Iterable[DocumentSection]) -> Generator[DocumentSection, None, None]:
+                while end_idx > start_idx:
+                    # find a cutoff point that doesn't cut in the middle of utf8 multi-byte sequences
+                    try:
+                        # check that the truncated data can be properly decoded
+                        text = encoding.decode(tokens[start_idx:end_idx], errors='strict')
+                        break
+                    except UnicodeDecodeError:
+                        # we split the token array at a point where the utf8 encoding is broken
+                        end_idx -= 1
+                assert end_idx > start_idx
+                assert text
+                yield DocumentSection(text=text, metadata=section.metadata)
+                start_idx = max(start_idx + 1, end_idx - self._overlap)  # ensure we make progress
+    def _char_chunks(self, input: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
         for section in input:
             if section.text is None:
                 continue
@@ -301,7 +422,7 @@ class DocumentSplitter(ComponentIterator):
             while start_idx < len(section.text):
                 end_idx = min(start_idx + self._limit, len(section.text))
                 text = section.text[start_idx:end_idx]
-                yield DocumentSection(text=text, md=section.md)
+                yield DocumentSection(text=text, metadata=section.metadata)
                 start_idx += self._limit - self._overlap
     def close(self) -> None:

pixeltable/iterators/video.py CHANGED Viewed

@@ -1,21 +1,20 @@
-from typing import Dict, Any, List, Tuple
-from pathlib import Path
-import math
 import logging
+import math
+from pathlib import Path
+from typing import Dict, Any, List, Tuple
-import cv2
 import PIL.Image
+import cv2
-from .base import ComponentIterator
-from pixeltable.type_system import ColumnType, VideoType, ImageType, IntType, FloatType
+from pixeltable import exprs
 from pixeltable.exceptions import Error
+from pixeltable.type_system import ColumnType, VideoType, ImageType, IntType, FloatType
+from .base import ComponentIterator
 _logger = logging.getLogger('pixeltable')
 class FrameIterator(ComponentIterator):
-    def __init__(self, video: str, fps: float = 0.0):
+    def __init__(self, video: str, *, fps: float = 0.0):
         video_path = Path(video)
         assert video_path.exists() and video_path.is_file()
         self.video_path = video_path

pixeltable/metadata/__init__.py CHANGED Viewed

@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
 from .schema import SystemInfo, SystemInfoMd
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 12
+VERSION = 14
 def create_system_info(engine: sql.engine.Engine) -> None:
@@ -30,17 +30,21 @@ def register_converter(version: int, cb: Callable[[sql.engine.Engine], None]) ->
     global converter_cbs
     converter_cbs[version] = cb
+def noop_converter(engine: sql.engine.Engine) -> None:
+    # Converter to use when incrementing the schema version, but without any functional changes
+    pass
 # load all converter modules
 for _, modname, _ in pkgutil.iter_modules([os.path.dirname(__file__) + '/converters']):
     importlib.import_module('pixeltable.metadata.converters.' + modname)
 def upgrade_md(engine: sql.engine.Engine) -> None:
     """Upgrade the metadata schema to the current version"""
-    with orm.Session(engine, future=True) as session:
+    with orm.Session(engine) as session:
         system_info = session.query(SystemInfo).one().md
         md_version = system_info['schema_version']
         if md_version == VERSION:
-                return
+            return
         while md_version < VERSION:
             if md_version not in converter_cbs:
                 raise RuntimeError(f'No metadata converter for version {md_version}')

pixeltable/metadata/converters/convert_12.py ADDED Viewed

@@ -0,0 +1,3 @@
+from pixeltable.metadata import register_converter, noop_converter
+register_converter(12, noop_converter)

pixeltable/metadata/converters/convert_13.py ADDED Viewed

@@ -0,0 +1,41 @@
+import logging
+from typing import Any
+import sqlalchemy as sql
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.schema import Table
+_logger = logging.getLogger('pixeltable')
+def convert_13(engine: sql.engine.Engine) -> None:
+    with engine.begin() as conn:
+        for row in conn.execute(sql.select(Table)):
+            id = row[0]
+            md = row[2]
+            updated_md = _update_md(md)
+            if updated_md != md:
+                _logger.info(f'Updating schema for table: {id}')
+                conn.execute(sql.update(Table).where(Table.id == id).values(md=updated_md))
+# Traverse the schema dictionary and replace instances of `ExplicitBatchedFunction` with
+# `CallableFunction`. DB versions prior to 14 can't contain serialized batched functions,
+# so this is all we need to do.
+def _update_md(md: Any) -> Any:
+    if isinstance(md, dict):
+        updated_md = {}
+        for k, v in md.items():
+            if k == '_classpath' and v == 'pixeltable.func.batched_function.ExplicitBatchedFunction':
+                updated_md[k] = 'pixeltable.func.callable_function.CallableFunction'
+            else:
+                updated_md[k] = _update_md(v)
+        return updated_md
+    elif isinstance(md, list):
+        return [_update_md(v) for v in md]
+    else:
+        return md
+register_converter(13, convert_13)

pixeltable 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl

Potentially problematic release.

pixeltable 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl