PyPI - playa-pdf - Versions diffs - 0.9.0__cp314-cp314-win_amd64.whl → 0.10.0__cp314-cp314-win_amd64.whl - Mend

playa-pdf 0.9.0__cp314-cp314-win_amd64.whl → 0.10.0__cp314-cp314-win_amd64.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.

Files changed (32) hide show

5bb3f64a832c8c06474c__mypyc.cp314-win_amd64.pyd +0 -0
playa/_saslprep.cp314-win_amd64.pyd +0 -0
playa/_version.py +2 -2
playa/arcfour.cp314-win_amd64.pyd +0 -0
playa/arcfour.py +3 -5
playa/ccitt.cp314-win_amd64.pyd +0 -0
playa/cmapdb.py +3 -2
playa/content.py +118 -68
playa/data_structures.py +56 -30
playa/document.py +562 -458
playa/font.py +1 -0
playa/image.cp314-win_amd64.pyd +0 -0
playa/interp.py +28 -28
playa/lzw.cp314-win_amd64.pyd +0 -0
playa/miner.cp314-win_amd64.pyd +0 -0
playa/miner.py +14 -13
playa/page.py +27 -58
playa/parser.py +26 -7
playa/pdftypes.py +9 -7
playa/runlength.cp314-win_amd64.pyd +0 -0
playa/security.py +6 -6
playa/structure.py +37 -13
playa/utils.cp314-win_amd64.pyd +0 -0
playa/utils.py +2 -2
playa/xref.cp314-win_amd64.pyd +0 -0
playa/xref.py +94 -57
{playa_pdf-0.9.0.dist-info → playa_pdf-0.10.0.dist-info}/METADATA +117 -103
{playa_pdf-0.9.0.dist-info → playa_pdf-0.10.0.dist-info}/RECORD +31 -26
f3e1e5905acd33c8f397__mypyc.cp314-win_amd64.pyd +0 -0
{playa_pdf-0.9.0.dist-info → playa_pdf-0.10.0.dist-info}/WHEEL +0 -0
{playa_pdf-0.9.0.dist-info → playa_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
{playa_pdf-0.9.0.dist-info → playa_pdf-0.10.0.dist-info}/licenses/LICENSE +0 -0

5bb3f64a832c8c06474c__mypyc.cp314-win_amd64.pyd ADDED Viewed

Binary file

playa/_saslprep.cp314-win_amd64.pyd ADDED Viewed

Binary file

playa/_version.py CHANGED Viewed

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
-__version__ = version = '0.9.0'
-__version_tuple__ = version_tuple = (0, 9, 0)
+__version__ = version = '0.10.0'
+__version_tuple__ = version_tuple = (0, 10, 0)
 __commit_id__ = commit_id = None

playa/arcfour.cp314-win_amd64.pyd ADDED Viewed

Binary file

playa/arcfour.py CHANGED Viewed

@@ -4,13 +4,13 @@ This code is in the public domain.
 """
-from typing import Sequence
+from typing import List, Sequence
 class Arcfour:
     def __init__(self, key: Sequence[int]) -> None:
         # because Py3 range is not indexable
-        s = [i for i in range(256)]
+        s: List[int] = [i for i in range(256)]
         j = 0
         klen = len(key)
         for i in range(256):
@@ -23,7 +23,7 @@ class Arcfour:
         (i, j) = (self.i, self.j)
         s = self.s
         r = b""
-        for c in iter(data):
+        for c in data:
             i = (i + 1) % 256
             j = (j + s[i]) % 256
             (s[i], s[j]) = (s[j], s[i])
@@ -31,5 +31,3 @@ class Arcfour:
             r += bytes((c ^ k,))
         (self.i, self.j) = (i, j)
         return r
-    encrypt = decrypt = process

playa/ccitt.cp314-win_amd64.pyd CHANGED Viewed

Binary file

playa/cmapdb.py CHANGED Viewed

@@ -196,11 +196,12 @@ class CMapDB:
         name = name.replace("\0", "")
         filename = "%s.pickle.gz" % name
         pklpath = (CMAP_DIR / filename).resolve()
-        if not pklpath.is_relative_to(CMAP_DIR):
-            raise KeyError(f"Ignoring malicious or malformed CMap {name}")
         try:
+            _ = pklpath.relative_to(CMAP_DIR)
             with gzip.open(pklpath) as gzfile:
                 return pickle.load(gzfile)
+        except ValueError as e:
+            raise KeyError(f"Ignoring malicious or malformed CMap {name}") from e
         except FileNotFoundError as e:
             raise KeyError(f"CMap {name} not found in CMapDB") from e

playa/content.py CHANGED Viewed

@@ -4,6 +4,7 @@ PDF content objects created by the interpreter.
 import itertools
 import logging
+import operator
 from abc import abstractmethod
 from copy import copy
 from dataclasses import dataclass
@@ -17,8 +18,10 @@ from typing import (
     Mapping,
     NamedTuple,
     Sequence,
+    Sized,
     Tuple,
     Union,
+    overload,
 )
 from playa.color import (
@@ -474,26 +477,6 @@ class ImageObject(ContentObject):
 LITERAL_TRANSPARENCY = LIT("Transparency")
-def _extract_mcid_texts(itor: Iterable[ContentObject]) -> Dict[int, List[str]]:
-    """Get text for all MCIDs on a page or in a Form XObject"""
-    mctext: Dict[int, List[str]] = {}
-    for obj in itor:
-        if not isinstance(obj, TextObject):
-            continue
-        mcs = obj.mcs
-        if mcs is None or mcs.mcid is None:
-            continue
-        if "ActualText" in mcs.props:
-            assert isinstance(mcs.props["ActualText"], bytes)
-            chars = decode_text(mcs.props["ActualText"])
-        else:
-            chars = obj.chars
-        # Remove soft hyphens
-        chars = chars.replace("\xad", "")
-        mctext.setdefault(mcs.mcid, []).append(chars)
-    return mctext
 @dataclass
 class XObjectObject(ContentObject):
     """An eXternal Object, in the context of a page.
@@ -555,7 +538,7 @@ class XObjectObject(ContentObject):
         for pos, obj in ContentParser([self.stream], self.doc):
             yield obj
-    def __iter__(self) -> Iterator["ContentObject"]:
+    def __iter__(self) -> Iterator[ContentObject]:
         from playa.interp import LazyInterpreter
         interp = LazyInterpreter(
@@ -625,55 +608,29 @@ class XObjectObject(ContentObject):
         return self._structmap
     @property
-    def marked_content(self) -> Sequence[Union[None, Iterable["ContentObject"]]]:
-        """Mapping of marked content IDs to iterators over content objects.
-        These are the content objects associated with the structural
-        elements in `XObjectObject.structure`.  So, for instance, you can do:
+    def marked_content(self) -> "ContentSequence":
+        """A [`ContentSequence`][playa.content.ContentSequence] containing
+        content objects associated with the structural elements in
+        [`structure`][playa.content.XObjectObject.structure].  They
+        consist of a sequence with the same indices (these are the
+        marked content IDs) as the structure so can be zipped:
             for element, contents in zip(xobj.structure,
                                          xobj.marked_content):
-                if element is not None:
-                    if contents is not None:
-                        for obj in contents:
-                            ...  # do something with it
+                for obj in contents:
+                    ...  # do something with it
         Or you can also access the contents of a single element:
-            if xobj.marked_content[mcid] is not None:
-                for obj in xobj.marked_content[mcid]:
-                    ... # do something with it
+            for obj in xobj.marked_content[mcid]:
+                ... # do something with it
-        Why do you have to check if it's `None`?  Because the values
-        are not necessarily sequences (they may just be positions in
-        the content stream), it isn't possible to know if they are
-        empty without iterating over them, which you may or may not
-        want to do, because you are Lazy.
         """
-        from playa.interp import _make_contentmap
         if hasattr(self, "_marked_contents"):
             return self._marked_contents
-        self._marked_contents: Sequence[Union[None, Iterable["ContentObject"]]] = (
-            _make_contentmap(self)
-        )
+        self._marked_contents: ContentSequence = ContentSequence(self)
         return self._marked_contents
-    @property
-    def mcid_texts(self) -> Mapping[int, List[str]]:
-        """Mapping of marked content IDs to Unicode text strings.
-        For use in text extraction from tagged PDFs.
-        Danger: Do not rely on this being a `dict`.
-            Currently this is implemented eagerly, but in the future it
-            may return a lazy object.
-        """
-        if hasattr(self, "_textmap"):
-            return self._textmap
-        self._textmap: Mapping[int, List[str]] = _extract_mcid_texts(self)
-        return self._textmap
     @property
     def fonts(self) -> Mapping[str, Font]:
         """Mapping of resource names to fonts for this Form XObject.
@@ -687,19 +644,14 @@ class XObjectObject(ContentObject):
             generally considered to be globally unique, it may be
             possible to access fonts by them in the future.
-        Danger: Do not rely on this being a `dict`.
-            Currently this is implemented eagerly, but in the future it
-            may return a lazy object which only loads fonts on demand.
         """
-        from playa.interp import _make_fontmap
+        from playa.interp import FontMapping
         if hasattr(self, "_fontmap"):
             return self._fontmap
-        if self.resources is None or "Font" not in self.resources:
-            self._fontmap: Dict[str, Font] = {}
-        else:
-            self._fontmap = _make_fontmap(self.resources["Font"], self.doc)
+        self._fontmap: Mapping[str, Font] = FontMapping(
+            self.resources.get("Font") if self.resources else None, self.doc
+        )
         return self._fontmap
     @classmethod
@@ -806,7 +758,14 @@ class TextBase(ContentObject):
     @property
     @abstractmethod
-    def matrix(self) -> Matrix: ...
+    def matrix(self) -> Matrix:
+        """Rendering matrix `T_rm`, which transforms text space coordinates to
+        device space (PDF 2.0 section 9.4.4)."""
+    @property
+    @abstractmethod
+    def displacement(self) -> Point:
+        """Vector to the origin of the next glyph in device space."""
     @property
     def font(self) -> Font:
@@ -1298,3 +1257,94 @@ class TextObject(TextBase):
                 continue
             nglyphs += sum(1 for _ in font.decode(obj))
         return nglyphs
+class ContentSection(Iterable[ContentObject], Sized):
+    """Sequence of content objects in a marked content section.
+    This is a `Sized` collection so that you can quickly check if it
+    is non-empty by its truth value.  The actual length may or may not
+    be relevant.
+    """
+    def __init__(self, objs: Iterable[ContentObject]) -> None:
+        self._objs = [obj.finalize() for obj in objs]
+        self._texts: Union[List[str], None] = None
+    def __len__(self) -> int:
+        return len(self._objs)
+    def __iter__(self) -> Iterator[ContentObject]:
+        return iter(self._objs)
+    @property
+    def texts(self) -> Sequence[str]:
+        """Sequence of text strings for a marked content section."""
+        if self._texts is not None:
+            return self._texts
+        self._texts = []
+        for obj in self._objs:
+            if not isinstance(obj, TextObject):
+                continue
+            mcs = obj.mcs
+            if mcs is None or mcs.mcid is None:
+                continue
+            if "ActualText" in mcs.props:
+                assert isinstance(mcs.props["ActualText"], bytes)
+                chars = decode_text(mcs.props["ActualText"])
+            else:
+                chars = obj.chars
+            # Remove soft hyphens
+            chars = chars.replace("\xad", "")
+            self._texts.append(chars)
+        return self._texts
+class ContentSequence(Sequence[ContentSection]):
+    """Collect content object in marked content sections.
+    These are organized in a sequence and ordered by marked content
+    ID, because this is the definition of "logical content order" and
+    also defines the reading order of text.
+    You can also get them as an iterator in "page content order",
+    i.e. the order in which they appeared in the actual content
+    stream, using the `page_order` property.
+    """
+    def __init__(self, streamer: Iterable[ContentObject]) -> None:
+        self._contents: Dict[int, ContentSection] = {}
+        self._maxid: int = 0
+        for mcid, objs in itertools.groupby(streamer, operator.attrgetter("mcid")):
+            if mcid is None:
+                continue
+            # Python dicts preserve insertion order, but if there are
+            # duplicate marked content sections (this is forbidden by
+            # the spec, but.....) we can't do page content order
+            self._contents[mcid] = ContentSection(objs)
+            self._maxid = max(self._maxid, mcid)
+    def __len__(self) -> int:
+        return self._maxid + 1
+    @property
+    def page_order(self) -> Iterator[ContentSection]:
+        """Marked content sections in page content order."""
+        yield from self._contents.values()
+    @overload
+    def __getitem__(self, mcid: int) -> ContentSection: ...
+    @overload
+    def __getitem__(self, mcid: slice) -> Sequence[ContentSection]: ...
+    def __getitem__(
+        self, mcid: Union[int, slice]
+    ) -> Union[ContentSection, Sequence[ContentSection]]:
+        if isinstance(mcid, slice):
+            return [self[idx] for idx in range(mcid.start, mcid.stop, mcid.step)]
+        else:
+            if mcid > self._maxid:
+                raise IndexError(f"Marked content ID {mcid} out of range")
+            return self._contents.get(mcid, ContentSection([]))

playa/data_structures.py CHANGED Viewed

@@ -1,12 +1,16 @@
-from typing import Any, Dict, Iterator, Tuple, Union
+from typing import Dict, Iterator, Mapping, Tuple, Union, ItemsView
-from playa.pdftypes import dict_value, int_value, list_value, str_value
+from playa.pdftypes import PDFObject, dict_value, int_value, list_value, str_value
 from playa.utils import choplist
+# TODO: NameTree and NumberTree are nearly identical and should be
+# refactored to a single base class.
 def walk_number_tree(
-    tree: Dict[str, Any], key: Union[int, None] = None
-) -> Iterator[Tuple[int, Any]]:
+    tree: Dict[str, PDFObject], key: Union[int, None] = None
+) -> Iterator[Tuple[int, PDFObject]]:
     stack = [tree]
     while stack:
         item = dict_value(stack.pop())
@@ -21,34 +25,45 @@ def walk_number_tree(
             stack.extend(reversed(list_value(item["Kids"])))
-class NumberTree:
+class NumberTreeItemsView(ItemsView[int, PDFObject]):
+    _mapping: "NumberTree"
+    def __iter__(self) -> Iterator[Tuple[int, PDFObject]]:
+        yield from walk_number_tree(self._mapping._obj)
+class NumberTree(Mapping[int, PDFObject]):
     """A PDF number tree.
     See Section 7.9.7 of the PDF 1.7 Reference.
+    Raises:
+        TypeError: If initialized with a non-dictionary.
     """
-    def __init__(self, obj: Any):
+    def __init__(self, obj: PDFObject):
         self._obj = dict_value(obj)
-    def __iter__(self) -> Iterator[Tuple[int, Any]]:
-        return walk_number_tree(self._obj)
+    def __len__(self) -> int:
+        return sum(1 for _ in self)
-    def __contains__(self, num: int) -> bool:
-        for idx, _ in walk_number_tree(self._obj, num):
-            if idx == num:
-                return True
-        return False
+    def __iter__(self) -> Iterator[int]:
+        for idx, _ in walk_number_tree(self._obj):
+            yield idx
-    def __getitem__(self, num: int) -> Any:
+    def __getitem__(self, num: int) -> PDFObject:
         for idx, val in walk_number_tree(self._obj, num):
             if idx == num:
                 return val
-        raise IndexError(f"Number {num} not in tree")
+        raise KeyError(f"Number {num} not in tree")
+    def items(self) -> NumberTreeItemsView:
+        return NumberTreeItemsView(self)
 def walk_name_tree(
-    tree: Dict[str, Any], key: Union[bytes, None] = None
-) -> Iterator[Tuple[bytes, Any]]:
+    tree: Dict[str, PDFObject], key: Union[bytes, None] = None
+) -> Iterator[Tuple[bytes, PDFObject]]:
     stack = [tree]
     while stack:
         item = dict_value(stack.pop())
@@ -63,26 +78,37 @@ def walk_name_tree(
             stack.extend(reversed(list_value(item["Kids"])))
-class NameTree:
+class NameTreeItemsView(ItemsView[bytes, PDFObject]):
+    _mapping: "NameTree"
+    def __iter__(self) -> Iterator[Tuple[bytes, PDFObject]]:
+        yield from walk_name_tree(self._mapping._obj)
+class NameTree(Mapping[bytes, PDFObject]):
     """A PDF name tree.
     See Section 7.9.6 of the PDF 1.7 Reference.
+    Raises:
+        TypeError: If initialized with a non-dictionary.
     """
-    def __init__(self, obj: Any):
+    def __init__(self, obj: PDFObject):
         self._obj = dict_value(obj)
-    def __iter__(self) -> Iterator[Tuple[bytes, Any]]:
-        return walk_name_tree(self._obj, None)
+    def __len__(self) -> int:
+        return sum(1 for _ in self)
-    def __contains__(self, name: bytes) -> bool:
-        for idx, val in self:
-            if idx == name:
-                return True
-        return False
+    def __iter__(self) -> Iterator[bytes]:
+        for name, _ in walk_name_tree(self._obj):
+            yield name
-    def __getitem__(self, name: bytes) -> Any:
-        for idx, val in self:
-            if idx == name:
+    def __getitem__(self, key: bytes) -> PDFObject:
+        for name, val in walk_name_tree(self._obj, key):
+            if name == key:
                 return val
-        raise IndexError("Name %r not in tree" % name)
+        raise KeyError("Name %r not in tree" % key)
+    def items(self) -> NameTreeItemsView:
+        return NameTreeItemsView(self)