playa-pdf 0.9.0__cp314-cp314-win_amd64.whl → 0.10.0__cp314-cp314-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Binary file
playa/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.9.0'
32
- __version_tuple__ = version_tuple = (0, 9, 0)
31
+ __version__ = version = '0.10.0'
32
+ __version_tuple__ = version_tuple = (0, 10, 0)
33
33
 
34
34
  __commit_id__ = commit_id = None
Binary file
playa/arcfour.py CHANGED
@@ -4,13 +4,13 @@ This code is in the public domain.
4
4
 
5
5
  """
6
6
 
7
- from typing import Sequence
7
+ from typing import List, Sequence
8
8
 
9
9
 
10
10
  class Arcfour:
11
11
  def __init__(self, key: Sequence[int]) -> None:
12
12
  # because Py3 range is not indexable
13
- s = [i for i in range(256)]
13
+ s: List[int] = [i for i in range(256)]
14
14
  j = 0
15
15
  klen = len(key)
16
16
  for i in range(256):
@@ -23,7 +23,7 @@ class Arcfour:
23
23
  (i, j) = (self.i, self.j)
24
24
  s = self.s
25
25
  r = b""
26
- for c in iter(data):
26
+ for c in data:
27
27
  i = (i + 1) % 256
28
28
  j = (j + s[i]) % 256
29
29
  (s[i], s[j]) = (s[j], s[i])
@@ -31,5 +31,3 @@ class Arcfour:
31
31
  r += bytes((c ^ k,))
32
32
  (self.i, self.j) = (i, j)
33
33
  return r
34
-
35
- encrypt = decrypt = process
Binary file
playa/cmapdb.py CHANGED
@@ -196,11 +196,12 @@ class CMapDB:
196
196
  name = name.replace("\0", "")
197
197
  filename = "%s.pickle.gz" % name
198
198
  pklpath = (CMAP_DIR / filename).resolve()
199
- if not pklpath.is_relative_to(CMAP_DIR):
200
- raise KeyError(f"Ignoring malicious or malformed CMap {name}")
201
199
  try:
200
+ _ = pklpath.relative_to(CMAP_DIR)
202
201
  with gzip.open(pklpath) as gzfile:
203
202
  return pickle.load(gzfile)
203
+ except ValueError as e:
204
+ raise KeyError(f"Ignoring malicious or malformed CMap {name}") from e
204
205
  except FileNotFoundError as e:
205
206
  raise KeyError(f"CMap {name} not found in CMapDB") from e
206
207
 
playa/content.py CHANGED
@@ -4,6 +4,7 @@ PDF content objects created by the interpreter.
4
4
 
5
5
  import itertools
6
6
  import logging
7
+ import operator
7
8
  from abc import abstractmethod
8
9
  from copy import copy
9
10
  from dataclasses import dataclass
@@ -17,8 +18,10 @@ from typing import (
17
18
  Mapping,
18
19
  NamedTuple,
19
20
  Sequence,
21
+ Sized,
20
22
  Tuple,
21
23
  Union,
24
+ overload,
22
25
  )
23
26
 
24
27
  from playa.color import (
@@ -474,26 +477,6 @@ class ImageObject(ContentObject):
474
477
  LITERAL_TRANSPARENCY = LIT("Transparency")
475
478
 
476
479
 
477
- def _extract_mcid_texts(itor: Iterable[ContentObject]) -> Dict[int, List[str]]:
478
- """Get text for all MCIDs on a page or in a Form XObject"""
479
- mctext: Dict[int, List[str]] = {}
480
- for obj in itor:
481
- if not isinstance(obj, TextObject):
482
- continue
483
- mcs = obj.mcs
484
- if mcs is None or mcs.mcid is None:
485
- continue
486
- if "ActualText" in mcs.props:
487
- assert isinstance(mcs.props["ActualText"], bytes)
488
- chars = decode_text(mcs.props["ActualText"])
489
- else:
490
- chars = obj.chars
491
- # Remove soft hyphens
492
- chars = chars.replace("\xad", "")
493
- mctext.setdefault(mcs.mcid, []).append(chars)
494
- return mctext
495
-
496
-
497
480
  @dataclass
498
481
  class XObjectObject(ContentObject):
499
482
  """An eXternal Object, in the context of a page.
@@ -555,7 +538,7 @@ class XObjectObject(ContentObject):
555
538
  for pos, obj in ContentParser([self.stream], self.doc):
556
539
  yield obj
557
540
 
558
- def __iter__(self) -> Iterator["ContentObject"]:
541
+ def __iter__(self) -> Iterator[ContentObject]:
559
542
  from playa.interp import LazyInterpreter
560
543
 
561
544
  interp = LazyInterpreter(
@@ -625,55 +608,29 @@ class XObjectObject(ContentObject):
625
608
  return self._structmap
626
609
 
627
610
  @property
628
- def marked_content(self) -> Sequence[Union[None, Iterable["ContentObject"]]]:
629
- """Mapping of marked content IDs to iterators over content objects.
630
-
631
- These are the content objects associated with the structural
632
- elements in `XObjectObject.structure`. So, for instance, you can do:
611
+ def marked_content(self) -> "ContentSequence":
612
+ """A [`ContentSequence`][playa.content.ContentSequence] containing
613
+ content objects associated with the structural elements in
614
+ [`structure`][playa.content.XObjectObject.structure]. They
615
+ consist of a sequence with the same indices (these are the
616
+ marked content IDs) as the structure so can be zipped:
633
617
 
634
618
  for element, contents in zip(xobj.structure,
635
619
  xobj.marked_content):
636
- if element is not None:
637
- if contents is not None:
638
- for obj in contents:
639
- ... # do something with it
620
+ for obj in contents:
621
+ ... # do something with it
640
622
 
641
623
  Or you can also access the contents of a single element:
642
624
 
643
- if xobj.marked_content[mcid] is not None:
644
- for obj in xobj.marked_content[mcid]:
645
- ... # do something with it
625
+ for obj in xobj.marked_content[mcid]:
626
+ ... # do something with it
646
627
 
647
- Why do you have to check if it's `None`? Because the values
648
- are not necessarily sequences (they may just be positions in
649
- the content stream), it isn't possible to know if they are
650
- empty without iterating over them, which you may or may not
651
- want to do, because you are Lazy.
652
628
  """
653
- from playa.interp import _make_contentmap
654
-
655
629
  if hasattr(self, "_marked_contents"):
656
630
  return self._marked_contents
657
- self._marked_contents: Sequence[Union[None, Iterable["ContentObject"]]] = (
658
- _make_contentmap(self)
659
- )
631
+ self._marked_contents: ContentSequence = ContentSequence(self)
660
632
  return self._marked_contents
661
633
 
662
- @property
663
- def mcid_texts(self) -> Mapping[int, List[str]]:
664
- """Mapping of marked content IDs to Unicode text strings.
665
-
666
- For use in text extraction from tagged PDFs.
667
-
668
- Danger: Do not rely on this being a `dict`.
669
- Currently this is implemented eagerly, but in the future it
670
- may return a lazy object.
671
- """
672
- if hasattr(self, "_textmap"):
673
- return self._textmap
674
- self._textmap: Mapping[int, List[str]] = _extract_mcid_texts(self)
675
- return self._textmap
676
-
677
634
  @property
678
635
  def fonts(self) -> Mapping[str, Font]:
679
636
  """Mapping of resource names to fonts for this Form XObject.
@@ -687,19 +644,14 @@ class XObjectObject(ContentObject):
687
644
  generally considered to be globally unique, it may be
688
645
  possible to access fonts by them in the future.
689
646
 
690
- Danger: Do not rely on this being a `dict`.
691
- Currently this is implemented eagerly, but in the future it
692
- may return a lazy object which only loads fonts on demand.
693
-
694
647
  """
695
- from playa.interp import _make_fontmap
648
+ from playa.interp import FontMapping
696
649
 
697
650
  if hasattr(self, "_fontmap"):
698
651
  return self._fontmap
699
- if self.resources is None or "Font" not in self.resources:
700
- self._fontmap: Dict[str, Font] = {}
701
- else:
702
- self._fontmap = _make_fontmap(self.resources["Font"], self.doc)
652
+ self._fontmap: Mapping[str, Font] = FontMapping(
653
+ self.resources.get("Font") if self.resources else None, self.doc
654
+ )
703
655
  return self._fontmap
704
656
 
705
657
  @classmethod
@@ -806,7 +758,14 @@ class TextBase(ContentObject):
806
758
 
807
759
  @property
808
760
  @abstractmethod
809
- def matrix(self) -> Matrix: ...
761
+ def matrix(self) -> Matrix:
762
+ """Rendering matrix `T_rm`, which transforms text space coordinates to
763
+ device space (PDF 2.0 section 9.4.4)."""
764
+
765
+ @property
766
+ @abstractmethod
767
+ def displacement(self) -> Point:
768
+ """Vector to the origin of the next glyph in device space."""
810
769
 
811
770
  @property
812
771
  def font(self) -> Font:
@@ -1298,3 +1257,94 @@ class TextObject(TextBase):
1298
1257
  continue
1299
1258
  nglyphs += sum(1 for _ in font.decode(obj))
1300
1259
  return nglyphs
1260
+
1261
+
1262
+ class ContentSection(Iterable[ContentObject], Sized):
1263
+ """Sequence of content objects in a marked content section.
1264
+
1265
+ This is a `Sized` collection so that you can quickly check if it
1266
+ is non-empty by its truth value. The actual length may or may not
1267
+ be relevant.
1268
+ """
1269
+
1270
+ def __init__(self, objs: Iterable[ContentObject]) -> None:
1271
+ self._objs = [obj.finalize() for obj in objs]
1272
+ self._texts: Union[List[str], None] = None
1273
+
1274
+ def __len__(self) -> int:
1275
+ return len(self._objs)
1276
+
1277
+ def __iter__(self) -> Iterator[ContentObject]:
1278
+ return iter(self._objs)
1279
+
1280
+ @property
1281
+ def texts(self) -> Sequence[str]:
1282
+ """Sequence of text strings for a marked content section."""
1283
+ if self._texts is not None:
1284
+ return self._texts
1285
+ self._texts = []
1286
+ for obj in self._objs:
1287
+ if not isinstance(obj, TextObject):
1288
+ continue
1289
+ mcs = obj.mcs
1290
+ if mcs is None or mcs.mcid is None:
1291
+ continue
1292
+ if "ActualText" in mcs.props:
1293
+ assert isinstance(mcs.props["ActualText"], bytes)
1294
+ chars = decode_text(mcs.props["ActualText"])
1295
+ else:
1296
+ chars = obj.chars
1297
+ # Remove soft hyphens
1298
+ chars = chars.replace("\xad", "")
1299
+ self._texts.append(chars)
1300
+ return self._texts
1301
+
1302
+
1303
+ class ContentSequence(Sequence[ContentSection]):
1304
+ """Collect content object in marked content sections.
1305
+
1306
+ These are organized in a sequence and ordered by marked content
1307
+ ID, because this is the definition of "logical content order" and
1308
+ also defines the reading order of text.
1309
+
1310
+ You can also get them as an iterator in "page content order",
1311
+ i.e. the order in which they appeared in the actual content
1312
+ stream, using the `page_order` property.
1313
+
1314
+ """
1315
+
1316
+ def __init__(self, streamer: Iterable[ContentObject]) -> None:
1317
+ self._contents: Dict[int, ContentSection] = {}
1318
+ self._maxid: int = 0
1319
+ for mcid, objs in itertools.groupby(streamer, operator.attrgetter("mcid")):
1320
+ if mcid is None:
1321
+ continue
1322
+ # Python dicts preserve insertion order, but if there are
1323
+ # duplicate marked content sections (this is forbidden by
1324
+ # the spec, but.....) we can't do page content order
1325
+ self._contents[mcid] = ContentSection(objs)
1326
+ self._maxid = max(self._maxid, mcid)
1327
+
1328
+ def __len__(self) -> int:
1329
+ return self._maxid + 1
1330
+
1331
+ @property
1332
+ def page_order(self) -> Iterator[ContentSection]:
1333
+ """Marked content sections in page content order."""
1334
+ yield from self._contents.values()
1335
+
1336
+ @overload
1337
+ def __getitem__(self, mcid: int) -> ContentSection: ...
1338
+
1339
+ @overload
1340
+ def __getitem__(self, mcid: slice) -> Sequence[ContentSection]: ...
1341
+
1342
+ def __getitem__(
1343
+ self, mcid: Union[int, slice]
1344
+ ) -> Union[ContentSection, Sequence[ContentSection]]:
1345
+ if isinstance(mcid, slice):
1346
+ return [self[idx] for idx in range(mcid.start, mcid.stop, mcid.step)]
1347
+ else:
1348
+ if mcid > self._maxid:
1349
+ raise IndexError(f"Marked content ID {mcid} out of range")
1350
+ return self._contents.get(mcid, ContentSection([]))
playa/data_structures.py CHANGED
@@ -1,12 +1,16 @@
1
- from typing import Any, Dict, Iterator, Tuple, Union
1
+ from typing import Dict, Iterator, Mapping, Tuple, Union, ItemsView
2
2
 
3
- from playa.pdftypes import dict_value, int_value, list_value, str_value
3
+ from playa.pdftypes import PDFObject, dict_value, int_value, list_value, str_value
4
4
  from playa.utils import choplist
5
5
 
6
6
 
7
+ # TODO: NameTree and NumberTree are nearly identical and should be
8
+ # refactored to a single base class.
9
+
10
+
7
11
  def walk_number_tree(
8
- tree: Dict[str, Any], key: Union[int, None] = None
9
- ) -> Iterator[Tuple[int, Any]]:
12
+ tree: Dict[str, PDFObject], key: Union[int, None] = None
13
+ ) -> Iterator[Tuple[int, PDFObject]]:
10
14
  stack = [tree]
11
15
  while stack:
12
16
  item = dict_value(stack.pop())
@@ -21,34 +25,45 @@ def walk_number_tree(
21
25
  stack.extend(reversed(list_value(item["Kids"])))
22
26
 
23
27
 
24
- class NumberTree:
28
+ class NumberTreeItemsView(ItemsView[int, PDFObject]):
29
+ _mapping: "NumberTree"
30
+
31
+ def __iter__(self) -> Iterator[Tuple[int, PDFObject]]:
32
+ yield from walk_number_tree(self._mapping._obj)
33
+
34
+
35
+ class NumberTree(Mapping[int, PDFObject]):
25
36
  """A PDF number tree.
26
37
 
27
38
  See Section 7.9.7 of the PDF 1.7 Reference.
39
+
40
+ Raises:
41
+ TypeError: If initialized with a non-dictionary.
28
42
  """
29
43
 
30
- def __init__(self, obj: Any):
44
+ def __init__(self, obj: PDFObject):
31
45
  self._obj = dict_value(obj)
32
46
 
33
- def __iter__(self) -> Iterator[Tuple[int, Any]]:
34
- return walk_number_tree(self._obj)
47
+ def __len__(self) -> int:
48
+ return sum(1 for _ in self)
35
49
 
36
- def __contains__(self, num: int) -> bool:
37
- for idx, _ in walk_number_tree(self._obj, num):
38
- if idx == num:
39
- return True
40
- return False
50
+ def __iter__(self) -> Iterator[int]:
51
+ for idx, _ in walk_number_tree(self._obj):
52
+ yield idx
41
53
 
42
- def __getitem__(self, num: int) -> Any:
54
+ def __getitem__(self, num: int) -> PDFObject:
43
55
  for idx, val in walk_number_tree(self._obj, num):
44
56
  if idx == num:
45
57
  return val
46
- raise IndexError(f"Number {num} not in tree")
58
+ raise KeyError(f"Number {num} not in tree")
59
+
60
+ def items(self) -> NumberTreeItemsView:
61
+ return NumberTreeItemsView(self)
47
62
 
48
63
 
49
64
  def walk_name_tree(
50
- tree: Dict[str, Any], key: Union[bytes, None] = None
51
- ) -> Iterator[Tuple[bytes, Any]]:
65
+ tree: Dict[str, PDFObject], key: Union[bytes, None] = None
66
+ ) -> Iterator[Tuple[bytes, PDFObject]]:
52
67
  stack = [tree]
53
68
  while stack:
54
69
  item = dict_value(stack.pop())
@@ -63,26 +78,37 @@ def walk_name_tree(
63
78
  stack.extend(reversed(list_value(item["Kids"])))
64
79
 
65
80
 
66
- class NameTree:
81
+ class NameTreeItemsView(ItemsView[bytes, PDFObject]):
82
+ _mapping: "NameTree"
83
+
84
+ def __iter__(self) -> Iterator[Tuple[bytes, PDFObject]]:
85
+ yield from walk_name_tree(self._mapping._obj)
86
+
87
+
88
+ class NameTree(Mapping[bytes, PDFObject]):
67
89
  """A PDF name tree.
68
90
 
69
91
  See Section 7.9.6 of the PDF 1.7 Reference.
92
+
93
+ Raises:
94
+ TypeError: If initialized with a non-dictionary.
70
95
  """
71
96
 
72
- def __init__(self, obj: Any):
97
+ def __init__(self, obj: PDFObject):
73
98
  self._obj = dict_value(obj)
74
99
 
75
- def __iter__(self) -> Iterator[Tuple[bytes, Any]]:
76
- return walk_name_tree(self._obj, None)
100
+ def __len__(self) -> int:
101
+ return sum(1 for _ in self)
77
102
 
78
- def __contains__(self, name: bytes) -> bool:
79
- for idx, val in self:
80
- if idx == name:
81
- return True
82
- return False
103
+ def __iter__(self) -> Iterator[bytes]:
104
+ for name, _ in walk_name_tree(self._obj):
105
+ yield name
83
106
 
84
- def __getitem__(self, name: bytes) -> Any:
85
- for idx, val in self:
86
- if idx == name:
107
+ def __getitem__(self, key: bytes) -> PDFObject:
108
+ for name, val in walk_name_tree(self._obj, key):
109
+ if name == key:
87
110
  return val
88
- raise IndexError("Name %r not in tree" % name)
111
+ raise KeyError("Name %r not in tree" % key)
112
+
113
+ def items(self) -> NameTreeItemsView:
114
+ return NameTreeItemsView(self)