PyMuPDF 1.23.12__tar.gz → 1.23.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/PKG-INFO +1 -1
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/changes.txt +25 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/setup.py +1 -1
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src/__init__.py +20 -15
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src/table.py +274 -76
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src_classic/fitz_old.i +2 -2
- PyMuPDF-1.23.14/src_classic/version.i +7 -0
- PyMuPDF-1.23.14/tests/resources/test_2979.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_general.py +7 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_tables.py +21 -0
- PyMuPDF-1.23.12/src_classic/version.i +0 -7
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/COPYING +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/MANIFEST.in +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/README.md +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/READMErb.md +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/mupdf.tgz +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/pipcl.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/pyproject.toml +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/pytest.ini +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/scripts/gh_release.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/scripts/sysinstall.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/scripts/test.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src/__main__.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src/extra.i +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src/fitz.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src/utils.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src_classic/__init__.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src_classic/__main__.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src_classic/_config.h +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src_classic/helper-annot.i +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src_classic/helper-convert.i +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src_classic/helper-defines.i +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src_classic/helper-devices.i +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src_classic/helper-fields.i +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src_classic/helper-fileobj.i +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src_classic/helper-geo-c.i +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src_classic/helper-geo-py.i +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src_classic/helper-globals.i +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src_classic/helper-other.i +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src_classic/helper-pdfinfo.i +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src_classic/helper-pixmap.i +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src_classic/helper-portfolio.i +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src_classic/helper-python.i +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src_classic/helper-select.i +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src_classic/helper-stext.i +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src_classic/helper-xobject.i +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/src_classic/utils.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/README.md +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/001003ED.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/1.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/2.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/2201.00069.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/3.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/4.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/Bezier.epub +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/PragmaticaC.otf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/bug1945.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/bug1971.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/chinese-tables.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/chinese-tables.pickle +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/circular-toc.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/cython.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/cython.pickle +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/full_toc.txt +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/full_toc2.txt +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/github_sample.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/has-bad-fonts.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/image-file1.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/img-transparent.png +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/joined.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/metadata.txt +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/mupdf_explored.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/nur-ruhig.jpg +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/quad-calc-0.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/simple_toc.txt +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/symbol-list.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/symbols.txt +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test-2333.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test-2462.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test2093.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test2182.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test2238.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_1645_expected.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_1645_expected_1.22.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_1824.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2108.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2270.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2533.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2548.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2553-2.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2553.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2596.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2608_expected +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2634.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2635.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2645_1.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2645_2.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2645_3.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2710.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2730.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2788.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2791_content.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2791_coverpage.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2861.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2871.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2904.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2907.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2954.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2957_1.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2957_2.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_2969.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/test_delete_image.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/type3font.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/v110-changes.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/resources/widgettest.pdf +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/run_compound.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_2548.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_2634.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_2791.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_2904.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_2907.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_annots.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_badfonts.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_crypting.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_docs_samples.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_drawings.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_embeddedfiles.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_extractimage.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_flake8.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_font.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_geometry.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_imagebbox.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_insertimage.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_insertpdf.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_linequad.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_metadata.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_named_links.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_nonpdf.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_object_manipulation.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_optional_content.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_pagedelete.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_pagelabels.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_pixmap.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_showpdfpage.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_story.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_tesseract.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_textbox.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_textextract.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_textsearch.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_toc.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_widgets.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/tests/test_word_delimiters.py +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/valgrind.supp +0 -0
- {PyMuPDF-1.23.12 → PyMuPDF-1.23.14}/wdev.py +0 -0
|
@@ -2,6 +2,31 @@ Change Log
|
|
|
2
2
|
==========
|
|
3
3
|
|
|
4
4
|
|
|
5
|
+
**Changes in version 1.23.14 (2024-01-15)**
|
|
6
|
+
|
|
7
|
+
* Bug fixes:
|
|
8
|
+
|
|
9
|
+
* **Fixed** `3038 <https://github.com/pymupdf/PyMuPDF/issues/3038>`_: JM_pixmap_from_display_list > Assertion Error : Checking for wrong type
|
|
10
|
+
* **Fixed** `3039 <https://github.com/pymupdf/PyMuPDF/issues/3039>`_: Issue with doc.close() not closing the document in PyMuPDF
|
|
11
|
+
|
|
12
|
+
* Other:
|
|
13
|
+
|
|
14
|
+
* Ensure valid "re" rectangles in `Page.get_drawings()` with derotated pages.
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
**Changes in version 1.23.13 (2024-01-15)**
|
|
18
|
+
|
|
19
|
+
* Bug fixes:
|
|
20
|
+
|
|
21
|
+
* **Fixed** `2979 <https://github.com/pymupdf/PyMuPDF/issues/2979>`_: list index out of range in to_pandas()
|
|
22
|
+
* **Fixed** `3001 <https://github.com/pymupdf/PyMuPDF/issues/3001>`_: Calling find_tables() on one document alters the bounding boxes of a subsequent document
|
|
23
|
+
|
|
24
|
+
* Other:
|
|
25
|
+
|
|
26
|
+
* Fixed `Rect.height` and `Rect.width` to never return negative values.
|
|
27
|
+
* Fixed `TextPage.extractIMGINFO()`'s returned `dictkey_yres` value.
|
|
28
|
+
|
|
29
|
+
|
|
5
30
|
**Changes in version 1.23.12 (2024-01-12)**
|
|
6
31
|
|
|
7
32
|
* * **Fixed** `3027 <https://github.com/pymupdf/PyMuPDF/issues/3027>`_: Page.get_text throws Attribute Error for 'parent'
|
|
@@ -2460,7 +2460,9 @@ class DisplayList:
|
|
|
2460
2460
|
assert 0, f'Unrecognised {args=}'
|
|
2461
2461
|
|
|
2462
2462
|
def get_pixmap(self, matrix=None, colorspace=None, alpha=0, clip=None):
|
|
2463
|
-
if
|
|
2463
|
+
if isinstance(colorspace, Colorspace):
|
|
2464
|
+
colorspace = colorspace.this
|
|
2465
|
+
else:
|
|
2464
2466
|
colorspace = mupdf.FzColorspace(mupdf.FzColorspace.Fixed_RGB)
|
|
2465
2467
|
val = JM_pixmap_from_display_list(self.this, matrix, colorspace, alpha, clip, None)
|
|
2466
2468
|
val.thisown = True
|
|
@@ -3558,16 +3560,12 @@ class Document:
|
|
|
3558
3560
|
#self._reset_page_refs()
|
|
3559
3561
|
#self.metadata = None
|
|
3560
3562
|
#self.stream = None
|
|
3561
|
-
|
|
3563
|
+
self.is_closed = True
|
|
3562
3564
|
#self.FontInfos = []
|
|
3563
3565
|
#self.Graftmaps = {}
|
|
3564
3566
|
#self.ShownPages = {}
|
|
3565
3567
|
#self.InsertedImages = {}
|
|
3566
|
-
|
|
3567
3568
|
#self.this = None
|
|
3568
|
-
self.close_internal()
|
|
3569
|
-
|
|
3570
|
-
def close_internal(self):
|
|
3571
3569
|
self.this = None
|
|
3572
3570
|
|
|
3573
3571
|
def convert_to_pdf(self, from_page=0, to_page=-1, rotate=0):
|
|
@@ -8673,7 +8671,7 @@ class Page:
|
|
|
8673
8671
|
cmd = item[0]
|
|
8674
8672
|
rest = item[1:]
|
|
8675
8673
|
if cmd == "re":
|
|
8676
|
-
item = ("re", Rect(rest[0]), rest[1])
|
|
8674
|
+
item = ("re", Rect(rest[0]).normalize(), rest[1])
|
|
8677
8675
|
elif cmd == "qu":
|
|
8678
8676
|
item = ("qu", Quad(rest[0]))
|
|
8679
8677
|
else:
|
|
@@ -8807,7 +8805,7 @@ class Page:
|
|
|
8807
8805
|
cmd = item[0]
|
|
8808
8806
|
rest = item[1:]
|
|
8809
8807
|
if cmd == "re":
|
|
8810
|
-
item = ("re", Rect(rest[0]), rest[1])
|
|
8808
|
+
item = ("re", Rect(rest[0]).normalize(), rest[1])
|
|
8811
8809
|
elif cmd == "qu":
|
|
8812
8810
|
item = ("qu", Quad(rest[0]))
|
|
8813
8811
|
else:
|
|
@@ -10778,6 +10776,10 @@ class Rect:
|
|
|
10778
10776
|
"""Check if containing point-like or rect-like x."""
|
|
10779
10777
|
return self.__contains__(x)
|
|
10780
10778
|
|
|
10779
|
+
@property
|
|
10780
|
+
def height(self):
|
|
10781
|
+
return max(0, self.y1 - self.y0)
|
|
10782
|
+
|
|
10781
10783
|
def include_point(self, p):
|
|
10782
10784
|
"""Extend to include point-like p."""
|
|
10783
10785
|
if len(p) != 2:
|
|
@@ -10899,15 +10901,17 @@ class Rect:
|
|
|
10899
10901
|
self.x0, self.y0, self.x1, self.y1 = util_transform_rect(self, m)
|
|
10900
10902
|
return self
|
|
10901
10903
|
|
|
10904
|
+
@property
|
|
10905
|
+
def width(self):
|
|
10906
|
+
return max(0, self.x1 - self.x0)
|
|
10907
|
+
|
|
10902
10908
|
__div__ = __truediv__
|
|
10903
10909
|
|
|
10904
10910
|
bl = bottom_left
|
|
10905
10911
|
br = bottom_right
|
|
10906
|
-
height = property(lambda self: abs(self.y1 - self.y0))
|
|
10907
10912
|
irect = property(round)
|
|
10908
10913
|
tl = top_left
|
|
10909
10914
|
tr = top_right
|
|
10910
|
-
width = property(lambda self: abs(self.x1 - self.x0))
|
|
10911
10915
|
|
|
10912
10916
|
|
|
10913
10917
|
class Shape:
|
|
@@ -12518,7 +12522,7 @@ class TextPage:
|
|
|
12518
12522
|
block_dict[ dictkey_colorspace] = mupdf.fz_colorspace_n(cs)
|
|
12519
12523
|
block_dict[ dictkey_cs_name] = mupdf.fz_colorspace_name(cs)
|
|
12520
12524
|
block_dict[ dictkey_xres] = img.xres()
|
|
12521
|
-
block_dict[ dictkey_yres] = img.
|
|
12525
|
+
block_dict[ dictkey_yres] = img.yres()
|
|
12522
12526
|
block_dict[ dictkey_bpc] = img.bpc()
|
|
12523
12527
|
block_dict[ dictkey_size] = mupdf.fz_image_size(img)
|
|
12524
12528
|
if hashes:
|
|
@@ -16786,8 +16790,9 @@ def JM_pixmap_from_display_list(
|
|
|
16786
16790
|
rect = mupdf.fz_transform_rect(rect, matrix)
|
|
16787
16791
|
irect = mupdf.fz_round_rect(rect)
|
|
16788
16792
|
|
|
16789
|
-
assert isinstance( cs,
|
|
16790
|
-
|
|
16793
|
+
assert isinstance( cs, mupdf.FzColorspace)
|
|
16794
|
+
|
|
16795
|
+
pix = mupdf.fz_new_pixmap_with_bbox(cs, irect, seps, alpha)
|
|
16791
16796
|
if alpha:
|
|
16792
16797
|
mupdf.fz_clear_pixmap(pix)
|
|
16793
16798
|
else:
|
|
@@ -21734,8 +21739,8 @@ def int_rc(text):
|
|
|
21734
21739
|
return int(text)
|
|
21735
21740
|
|
|
21736
21741
|
VersionFitz = "1.23.9" # MuPDF version.
|
|
21737
|
-
VersionBind = "1.23.
|
|
21738
|
-
VersionDate = "2024-01-
|
|
21742
|
+
VersionBind = "1.23.14" # PyMuPDF version.
|
|
21743
|
+
VersionDate = "2024-01-15 00:00:01"
|
|
21739
21744
|
VersionDate2 = VersionDate.replace('-', '').replace(' ', '').replace(':', '')
|
|
21740
21745
|
version = (VersionBind, VersionFitz, VersionDate2)
|
|
21741
21746
|
pymupdf_version_tuple = tuple( [int_rc(i) for i in VersionBind.split('.')])
|
|
@@ -71,8 +71,10 @@ This is implemented as new class TableHeader with the properties:
|
|
|
71
71
|
* external: A bool indicating whether the header is outside the table cells.
|
|
72
72
|
|
|
73
73
|
"""
|
|
74
|
+
import inspect
|
|
74
75
|
import itertools
|
|
75
76
|
import string
|
|
77
|
+
from collections.abc import Sequence
|
|
76
78
|
from dataclasses import dataclass
|
|
77
79
|
from operator import itemgetter
|
|
78
80
|
|
|
@@ -91,7 +93,6 @@ from . import (
|
|
|
91
93
|
|
|
92
94
|
EDGES = [] # vector graphics from PyMuPDF
|
|
93
95
|
CHARS = [] # text characters from PyMuPDF
|
|
94
|
-
TEXTPAGE = None # TextPage of the page for optimized extraction
|
|
95
96
|
# -------------------------------------------------------------------
|
|
96
97
|
# End of PyMuPDF interface code
|
|
97
98
|
# -------------------------------------------------------------------
|
|
@@ -141,6 +142,18 @@ LIGATURES = {
|
|
|
141
142
|
}
|
|
142
143
|
|
|
143
144
|
|
|
145
|
+
def to_list(collection) -> list:
|
|
146
|
+
if isinstance(collection, list):
|
|
147
|
+
return collection
|
|
148
|
+
elif isinstance(collection, Sequence):
|
|
149
|
+
return list(collection)
|
|
150
|
+
elif hasattr(collection, "to_dict"):
|
|
151
|
+
res = collection.to_dict("records") # pragma: nocover
|
|
152
|
+
return res
|
|
153
|
+
else:
|
|
154
|
+
return list(collection)
|
|
155
|
+
|
|
156
|
+
|
|
144
157
|
class TextMap:
|
|
145
158
|
"""
|
|
146
159
|
A TextMap maps each unicode character in the text to an individual `char`
|
|
@@ -546,6 +559,78 @@ def extract_words(chars: list, **kwargs) -> list:
|
|
|
546
559
|
return WordExtractor(**kwargs).extract_words(chars)
|
|
547
560
|
|
|
548
561
|
|
|
562
|
+
TEXTMAP_KWARGS = inspect.signature(WordMap.to_textmap).parameters.keys()
|
|
563
|
+
WORD_EXTRACTOR_KWARGS = inspect.signature(WordExtractor).parameters.keys()
|
|
564
|
+
|
|
565
|
+
|
|
566
|
+
def chars_to_textmap(chars: list, **kwargs) -> TextMap:
|
|
567
|
+
kwargs.update({"presorted": True})
|
|
568
|
+
|
|
569
|
+
extractor = WordExtractor(
|
|
570
|
+
**{k: kwargs[k] for k in WORD_EXTRACTOR_KWARGS if k in kwargs}
|
|
571
|
+
)
|
|
572
|
+
wordmap = extractor.extract_wordmap(chars)
|
|
573
|
+
textmap = wordmap.to_textmap(
|
|
574
|
+
**{k: kwargs[k] for k in TEXTMAP_KWARGS if k in kwargs}
|
|
575
|
+
)
|
|
576
|
+
|
|
577
|
+
return textmap
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
def extract_text(chars: list, **kwargs) -> str:
|
|
581
|
+
chars = to_list(chars)
|
|
582
|
+
if len(chars) == 0:
|
|
583
|
+
return ""
|
|
584
|
+
|
|
585
|
+
if kwargs.get("layout"):
|
|
586
|
+
return chars_to_textmap(chars, **kwargs).as_string
|
|
587
|
+
else:
|
|
588
|
+
y_tolerance = kwargs.get("y_tolerance", DEFAULT_Y_TOLERANCE)
|
|
589
|
+
extractor = WordExtractor(
|
|
590
|
+
**{k: kwargs[k] for k in WORD_EXTRACTOR_KWARGS if k in kwargs}
|
|
591
|
+
)
|
|
592
|
+
words = extractor.extract_words(chars)
|
|
593
|
+
lines = cluster_objects(words, itemgetter("doctop"), y_tolerance)
|
|
594
|
+
return "\n".join(" ".join(word["text"] for word in line) for line in lines)
|
|
595
|
+
|
|
596
|
+
|
|
597
|
+
def collate_line(
|
|
598
|
+
line_chars: list,
|
|
599
|
+
tolerance=DEFAULT_X_TOLERANCE,
|
|
600
|
+
) -> str:
|
|
601
|
+
coll = ""
|
|
602
|
+
last_x1 = None
|
|
603
|
+
for char in sorted(line_chars, key=itemgetter("x0")):
|
|
604
|
+
if (last_x1 is not None) and (char["x0"] > (last_x1 + tolerance)):
|
|
605
|
+
coll += " "
|
|
606
|
+
last_x1 = char["x1"]
|
|
607
|
+
coll += char["text"]
|
|
608
|
+
return coll
|
|
609
|
+
|
|
610
|
+
|
|
611
|
+
def dedupe_chars(chars: list, tolerance=1) -> list:
|
|
612
|
+
"""
|
|
613
|
+
Removes duplicate chars — those sharing the same text, fontname, size,
|
|
614
|
+
and positioning (within `tolerance`) as other characters in the set.
|
|
615
|
+
"""
|
|
616
|
+
key = itemgetter("fontname", "size", "upright", "text")
|
|
617
|
+
pos_key = itemgetter("doctop", "x0")
|
|
618
|
+
|
|
619
|
+
def yield_unique_chars(chars: list):
|
|
620
|
+
sorted_chars = sorted(chars, key=key)
|
|
621
|
+
for grp, grp_chars in itertools.groupby(sorted_chars, key=key):
|
|
622
|
+
for y_cluster in cluster_objects(
|
|
623
|
+
list(grp_chars), itemgetter("doctop"), tolerance
|
|
624
|
+
):
|
|
625
|
+
for x_cluster in cluster_objects(
|
|
626
|
+
y_cluster, itemgetter("x0"), tolerance
|
|
627
|
+
):
|
|
628
|
+
yield sorted(x_cluster, key=pos_key)[0]
|
|
629
|
+
|
|
630
|
+
deduped = yield_unique_chars(chars)
|
|
631
|
+
return sorted(deduped, key=chars.index)
|
|
632
|
+
|
|
633
|
+
|
|
549
634
|
def line_to_edge(line):
|
|
550
635
|
edge = dict(line)
|
|
551
636
|
edge["orientation"] = "h" if (line["top"] == line["bottom"]) else "v"
|
|
@@ -1162,35 +1247,22 @@ class Table(object):
|
|
|
1162
1247
|
|
|
1163
1248
|
@property
|
|
1164
1249
|
def bbox(self):
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1250
|
+
c = self.cells
|
|
1251
|
+
return (
|
|
1252
|
+
min(map(itemgetter(0), c)),
|
|
1253
|
+
min(map(itemgetter(1), c)),
|
|
1254
|
+
max(map(itemgetter(2), c)),
|
|
1255
|
+
max(map(itemgetter(3), c)),
|
|
1256
|
+
)
|
|
1170
1257
|
|
|
1171
1258
|
@property
|
|
1172
1259
|
def rows(self) -> list:
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
if rot == 0:
|
|
1176
|
-
# sort by y, then by x
|
|
1177
|
-
i1, i2, f1, f2 = 1, 0, 1, 1
|
|
1178
|
-
elif rot == 90:
|
|
1179
|
-
# sort by x, then by y (desc)
|
|
1180
|
-
i1, i2, f1, f2 = 0, 1, -1, 1
|
|
1181
|
-
elif rot == 270:
|
|
1182
|
-
# sort by x (desc), then by y (asc)
|
|
1183
|
-
i1, i2, f1, f2 = 0, 1, 1, -1
|
|
1184
|
-
elif rot == 180:
|
|
1185
|
-
# sort by y (desc), then by x (desc)
|
|
1186
|
-
i1, i2, f1, f2 = 1, 0, -1, -1
|
|
1187
|
-
|
|
1188
|
-
xs = sorted(list(set([c[i1] for c in self.cells])), key=lambda x: f2 * x)
|
|
1260
|
+
_sorted = sorted(self.cells, key=itemgetter(1, 0))
|
|
1261
|
+
xs = list(sorted(set(map(itemgetter(0), self.cells))))
|
|
1189
1262
|
rows = []
|
|
1190
|
-
for
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
)
|
|
1263
|
+
for y, row_cells in itertools.groupby(_sorted, itemgetter(1)):
|
|
1264
|
+
xdict = {cell[0]: cell for cell in row_cells}
|
|
1265
|
+
row = TableRow([xdict.get(x) for x in xs])
|
|
1194
1266
|
rows.append(row)
|
|
1195
1267
|
return rows
|
|
1196
1268
|
|
|
@@ -1202,55 +1274,46 @@ class Table(object):
|
|
|
1202
1274
|
def col_count(self) -> int: # PyMuPDF extension
|
|
1203
1275
|
return max([len(r.cells) for r in self.rows])
|
|
1204
1276
|
|
|
1205
|
-
def extract(self) -> list:
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
Complete replacement by PyMuPDF text extraction.
|
|
1209
|
-
"""
|
|
1210
|
-
global TEXTPAGE
|
|
1211
|
-
|
|
1212
|
-
def get_text(cell):
|
|
1213
|
-
"""Accept char bbox areas with a cell overlap of at least 50%."""
|
|
1214
|
-
cell = Rect(cell) # we need a Rect object
|
|
1215
|
-
text = "" # result text
|
|
1216
|
-
for block in TEXTPAGE.extractRAWDICT()["blocks"]:
|
|
1217
|
-
if Rect(block["bbox"]).intersect(cell).is_empty:
|
|
1218
|
-
continue
|
|
1219
|
-
for line in block["lines"]:
|
|
1220
|
-
if Rect(line["bbox"]).intersect(cell).is_empty:
|
|
1221
|
-
continue
|
|
1222
|
-
for span in line["spans"]:
|
|
1223
|
-
chars = span["chars"]
|
|
1224
|
-
if text and chars:
|
|
1225
|
-
text += "\n" # new span appended after linebreak
|
|
1226
|
-
for char in chars:
|
|
1227
|
-
bbox = Rect(char["bbox"])
|
|
1228
|
-
if abs(bbox & cell) < 0.5 * abs(bbox):
|
|
1229
|
-
continue
|
|
1230
|
-
text += char["c"]
|
|
1231
|
-
|
|
1232
|
-
# no final line break, no wrapping spaces
|
|
1233
|
-
return text.rstrip("\n").strip()
|
|
1277
|
+
def extract(self, **kwargs) -> list:
|
|
1278
|
+
chars = CHARS
|
|
1279
|
+
table_arr = []
|
|
1234
1280
|
|
|
1235
|
-
|
|
1281
|
+
def char_in_bbox(char, bbox) -> bool:
|
|
1282
|
+
v_mid = (char["top"] + char["bottom"]) / 2
|
|
1283
|
+
h_mid = (char["x0"] + char["x1"]) / 2
|
|
1284
|
+
x0, top, x1, bottom = bbox
|
|
1285
|
+
return bool(
|
|
1286
|
+
(h_mid >= x0) and (h_mid < x1) and (v_mid >= top) and (v_mid < bottom)
|
|
1287
|
+
)
|
|
1236
1288
|
|
|
1237
1289
|
for row in self.rows:
|
|
1238
|
-
arr = []
|
|
1290
|
+
arr = []
|
|
1291
|
+
row_chars = [char for char in chars if char_in_bbox(char, row.bbox)]
|
|
1292
|
+
|
|
1239
1293
|
for cell in row.cells:
|
|
1240
1294
|
if cell is None:
|
|
1241
1295
|
cell_text = None
|
|
1242
1296
|
else:
|
|
1243
|
-
|
|
1297
|
+
cell_chars = [
|
|
1298
|
+
char for char in row_chars if char_in_bbox(char, cell)
|
|
1299
|
+
]
|
|
1300
|
+
|
|
1301
|
+
if len(cell_chars):
|
|
1302
|
+
kwargs["x_shift"] = cell[0]
|
|
1303
|
+
kwargs["y_shift"] = cell[1]
|
|
1304
|
+
if "layout" in kwargs:
|
|
1305
|
+
kwargs["layout_width"] = cell[2] - cell[0]
|
|
1306
|
+
kwargs["layout_height"] = cell[3] - cell[1]
|
|
1307
|
+
cell_text = extract_text(cell_chars, **kwargs)
|
|
1308
|
+
else:
|
|
1309
|
+
cell_text = ""
|
|
1244
1310
|
arr.append(cell_text)
|
|
1245
1311
|
table_arr.append(arr)
|
|
1246
1312
|
|
|
1247
1313
|
return table_arr
|
|
1248
1314
|
|
|
1249
|
-
def to_pandas(self):
|
|
1250
|
-
"""Return a pandas DataFrame version of the table.
|
|
1251
|
-
|
|
1252
|
-
This is original PyMuPDF code.
|
|
1253
|
-
"""
|
|
1315
|
+
def to_pandas(self, **kwargs):
|
|
1316
|
+
"""Return a pandas DataFrame version of the table."""
|
|
1254
1317
|
try:
|
|
1255
1318
|
import pandas as pd
|
|
1256
1319
|
except ModuleNotFoundError:
|
|
@@ -1362,9 +1425,6 @@ class Table(object):
|
|
|
1362
1425
|
cells.append((x0, y0, x1, y1))
|
|
1363
1426
|
return cells, bbox
|
|
1364
1427
|
|
|
1365
|
-
# we depend on small glyph heights!
|
|
1366
|
-
old_small = TOOLS.set_small_glyph_heights()
|
|
1367
|
-
TOOLS.set_small_glyph_heights(True)
|
|
1368
1428
|
try:
|
|
1369
1429
|
row = self.rows[0]
|
|
1370
1430
|
cells = row.cells
|
|
@@ -1509,7 +1569,6 @@ class Table(object):
|
|
|
1509
1569
|
page.get_textbox(c).replace("\n", " ").replace(" ", " ").strip()
|
|
1510
1570
|
for c in hdr_cells
|
|
1511
1571
|
]
|
|
1512
|
-
TOOLS.set_small_glyph_heights(old_small)
|
|
1513
1572
|
return TableHeader(tuple(hdr_bbox), hdr_cells, hdr_names, True)
|
|
1514
1573
|
|
|
1515
1574
|
|
|
@@ -1756,14 +1815,11 @@ page information themselves.
|
|
|
1756
1815
|
# -----------------------------------------------------------------------------
|
|
1757
1816
|
def make_chars(page, clip=None):
|
|
1758
1817
|
"""Extract text as "rawdict" to fill CHARS."""
|
|
1759
|
-
global CHARS
|
|
1760
|
-
old_small = TOOLS.set_small_glyph_heights()
|
|
1761
|
-
TOOLS.set_small_glyph_heights(True)
|
|
1818
|
+
global CHARS
|
|
1762
1819
|
page_number = page.number + 1
|
|
1763
1820
|
page_height = page.rect.height
|
|
1764
1821
|
ctm = page.transformation_matrix
|
|
1765
|
-
|
|
1766
|
-
blocks = TEXTPAGE.extractRAWDICT()["blocks"]
|
|
1822
|
+
blocks = page.get_text("rawdict", clip=clip, flags=TEXTFLAGS_TEXT)["blocks"]
|
|
1767
1823
|
doctop_base = page_height * page.number
|
|
1768
1824
|
for block in blocks:
|
|
1769
1825
|
for line in block["lines"]:
|
|
@@ -1810,8 +1866,6 @@ def make_chars(page, clip=None):
|
|
|
1810
1866
|
}
|
|
1811
1867
|
CHARS.append(char_dict)
|
|
1812
1868
|
|
|
1813
|
-
TOOLS.set_small_glyph_heights(old_small)
|
|
1814
|
-
|
|
1815
1869
|
|
|
1816
1870
|
# -----------------------------------------------------------------------------
|
|
1817
1871
|
# Extract all page vector graphics to fill the EDGES list.
|
|
@@ -1819,8 +1873,56 @@ def make_chars(page, clip=None):
|
|
|
1819
1873
|
# else to lines.
|
|
1820
1874
|
# -----------------------------------------------------------------------------
|
|
1821
1875
|
def make_edges(page, clip=None, tset=None):
|
|
1876
|
+
def has_text(bbox):
|
|
1877
|
+
text = page.get_text(clip=bbox).replace("\n", "").strip()
|
|
1878
|
+
if text:
|
|
1879
|
+
return True
|
|
1880
|
+
return False
|
|
1881
|
+
|
|
1882
|
+
def clean_graphics():
|
|
1883
|
+
"""Detect and join rectangles of connected vector graphics."""
|
|
1884
|
+
# we need to exclude meaningless graphics that e.g. paint a white
|
|
1885
|
+
# rectangle on the full page.
|
|
1886
|
+
|
|
1887
|
+
parea = abs(page.rect) * 0.8 # area of the full page (80%)
|
|
1888
|
+
|
|
1889
|
+
# exclude graphics that are too large
|
|
1890
|
+
paths = [p for p in page.get_drawings() if abs(p["rect"]) < parea]
|
|
1891
|
+
|
|
1892
|
+
# make a list of vector graphics rectangles (IRects are sufficient)
|
|
1893
|
+
prects = sorted([p["rect"] for p in paths], key=lambda r: (r.y1, r.x0))
|
|
1894
|
+
|
|
1895
|
+
new_rects = [] # the final list of joined rectangles
|
|
1896
|
+
|
|
1897
|
+
# -------------------------------------------------------------------------
|
|
1898
|
+
# Strategy: join rects that have at least one point in common.
|
|
1899
|
+
# -------------------------------------------------------------------------
|
|
1900
|
+
while prects: # the algorithm will empty this list
|
|
1901
|
+
r = prects[0] # first rectangle
|
|
1902
|
+
repeat = True
|
|
1903
|
+
while repeat:
|
|
1904
|
+
repeat = False
|
|
1905
|
+
for i in range(len(prects) - 1, -1, -1): # run backwards
|
|
1906
|
+
if i == 0: # don't touch first rectangle
|
|
1907
|
+
continue
|
|
1908
|
+
if r.intersects(prects[i]):
|
|
1909
|
+
r |= prects[i] # join in to first rect
|
|
1910
|
+
prects[0] = +r # update first
|
|
1911
|
+
del prects[i] # delete this rect
|
|
1912
|
+
repeat = True
|
|
1913
|
+
|
|
1914
|
+
# move first item over to result list
|
|
1915
|
+
new_rects.append(prects.pop(0))
|
|
1916
|
+
prects = sorted(list(set(prects)), key=lambda r: (r.y1, r.x0))
|
|
1917
|
+
|
|
1918
|
+
new_rects = sorted(list(set(new_rects)), key=lambda r: (r.y1, r.x0))
|
|
1919
|
+
return [
|
|
1920
|
+
r for r in new_rects if r.width > 5 and r.height > 5 and has_text(r)
|
|
1921
|
+
], paths
|
|
1922
|
+
|
|
1822
1923
|
global EDGES
|
|
1823
|
-
paths =
|
|
1924
|
+
bboxes, paths = clean_graphics()
|
|
1925
|
+
|
|
1824
1926
|
page_height = page.rect.height
|
|
1825
1927
|
doctop_basis = page.number * page_height
|
|
1826
1928
|
page_number = page.number + 1
|
|
@@ -1896,6 +1998,14 @@ def make_edges(page, clip=None, tset=None):
|
|
|
1896
1998
|
return line_dict
|
|
1897
1999
|
|
|
1898
2000
|
for p in paths:
|
|
2001
|
+
if p["type"] == "f" and p["fill"] == (1, 1, 1):
|
|
2002
|
+
continue
|
|
2003
|
+
if p["type"] == "f" and p["rect"].width > 3 and p["rect"].height > 3:
|
|
2004
|
+
if (
|
|
2005
|
+
tset.vertical_strategy == "lines_strict"
|
|
2006
|
+
or tset.horizontal_strategy == "lines_strict"
|
|
2007
|
+
):
|
|
2008
|
+
continue
|
|
1899
2009
|
items = p["items"] # items in this path
|
|
1900
2010
|
|
|
1901
2011
|
# if 'closePath', add a line from last to first point
|
|
@@ -1913,7 +2023,7 @@ def make_edges(page, clip=None, tset=None):
|
|
|
1913
2023
|
EDGES.append(line_to_edge(line_dict))
|
|
1914
2024
|
|
|
1915
2025
|
elif i[0] == "re": # a rectangle: decompose into 4 lines
|
|
1916
|
-
rect = i[1] # rectangle itself
|
|
2026
|
+
rect = i[1].normalize() # rectangle itself
|
|
1917
2027
|
# ignore minute rectangles
|
|
1918
2028
|
if rect.height <= y_tolerance and rect.width <= x_tolerance:
|
|
1919
2029
|
continue
|
|
@@ -1972,6 +2082,77 @@ def make_edges(page, clip=None, tset=None):
|
|
|
1972
2082
|
if line_dict:
|
|
1973
2083
|
EDGES.append(line_to_edge(line_dict))
|
|
1974
2084
|
|
|
2085
|
+
path = {"color": (0, 0, 0), "fill": None, "width": 1}
|
|
2086
|
+
for bbox in bboxes:
|
|
2087
|
+
line_dict = make_line(path, bbox.tl, bbox.tr, clip)
|
|
2088
|
+
EDGES.append(line_to_edge(line_dict))
|
|
2089
|
+
|
|
2090
|
+
line_dict = make_line(path, bbox.bl, bbox.br, clip)
|
|
2091
|
+
EDGES.append(line_to_edge(line_dict))
|
|
2092
|
+
|
|
2093
|
+
line_dict = make_line(path, bbox.tl, bbox.bl, clip)
|
|
2094
|
+
EDGES.append(line_to_edge(line_dict))
|
|
2095
|
+
|
|
2096
|
+
line_dict = make_line(path, bbox.tr, bbox.br, clip)
|
|
2097
|
+
EDGES.append(line_to_edge(line_dict))
|
|
2098
|
+
|
|
2099
|
+
|
|
2100
|
+
def page_rotation_set0(page):
|
|
2101
|
+
"""Nullify page rotation.
|
|
2102
|
+
|
|
2103
|
+
To correctly detect tables, page rotation must be zero.
|
|
2104
|
+
This function performs the necessary adjustments and returns information
|
|
2105
|
+
for reverting these changes.
|
|
2106
|
+
"""
|
|
2107
|
+
mediabox = page.mediabox
|
|
2108
|
+
rot = page.rotation # contains normalized rotation value
|
|
2109
|
+
# need to derotate the page's content
|
|
2110
|
+
mb = page.mediabox # current mediabox
|
|
2111
|
+
|
|
2112
|
+
if rot == 90:
|
|
2113
|
+
# before derotation, shift content horizontally
|
|
2114
|
+
mat0 = Matrix(1, 0, 0, 1, mb.y1 - mb.x1 - mb.x0 - mb.y0, 0)
|
|
2115
|
+
elif rot == 270:
|
|
2116
|
+
# before derotation, shift content vertically
|
|
2117
|
+
mat0 = Matrix(1, 0, 0, 1, 0, mb.x1 - mb.y1 - mb.y0 - mb.x0)
|
|
2118
|
+
else:
|
|
2119
|
+
mat0 = Matrix(1, 0, 0, 1, -2 * mb.x0, -2 * mb.y0)
|
|
2120
|
+
|
|
2121
|
+
# prefix with derotation matrix
|
|
2122
|
+
mat = mat0 * page.derotation_matrix
|
|
2123
|
+
cmd = b"%g %g %g %g %g %g cm " % tuple(mat)
|
|
2124
|
+
xref = TOOLS._insert_contents(page, cmd, 0)
|
|
2125
|
+
|
|
2126
|
+
# swap x- and y-coordinates
|
|
2127
|
+
if rot in (90, 270):
|
|
2128
|
+
x0, y0, x1, y1 = mb
|
|
2129
|
+
mb.x0 = y0
|
|
2130
|
+
mb.y0 = x0
|
|
2131
|
+
mb.x1 = y1
|
|
2132
|
+
mb.y1 = x1
|
|
2133
|
+
page.set_mediabox(mb)
|
|
2134
|
+
|
|
2135
|
+
page.set_rotation(0)
|
|
2136
|
+
|
|
2137
|
+
# refresh the page to apply these changes
|
|
2138
|
+
doc = page.parent
|
|
2139
|
+
pno = page.number
|
|
2140
|
+
page = doc[pno]
|
|
2141
|
+
return page, xref, rot, mediabox
|
|
2142
|
+
|
|
2143
|
+
|
|
2144
|
+
def page_rotation_reset(page, xref, rot, mediabox):
|
|
2145
|
+
"""Reset page rotation to original values.
|
|
2146
|
+
|
|
2147
|
+
To be used before we return tables."""
|
|
2148
|
+
doc = page.parent # document of the page
|
|
2149
|
+
doc.update_object(xref, "<<>>") # remove modifying matrix
|
|
2150
|
+
page.set_mediabox(mediabox) # set mediabox to old value
|
|
2151
|
+
page.set_rotation(rot) # set rotation to old value
|
|
2152
|
+
pno = page.number
|
|
2153
|
+
page = doc[pno] # update page info
|
|
2154
|
+
return page
|
|
2155
|
+
|
|
1975
2156
|
|
|
1976
2157
|
def find_tables(
|
|
1977
2158
|
page,
|
|
@@ -1995,10 +2176,18 @@ def find_tables(
|
|
|
1995
2176
|
text_tolerance=3,
|
|
1996
2177
|
text_x_tolerance=3,
|
|
1997
2178
|
text_y_tolerance=3,
|
|
2179
|
+
strategy=None, # offer abbreviation
|
|
1998
2180
|
):
|
|
1999
2181
|
global CHARS, EDGES
|
|
2000
2182
|
CHARS = []
|
|
2001
2183
|
EDGES = []
|
|
2184
|
+
old_small = bool(TOOLS.set_small_glyph_heights()) # save old value
|
|
2185
|
+
TOOLS.set_small_glyph_heights(True) # we need minimum bboxes
|
|
2186
|
+
if page.rotation != 0:
|
|
2187
|
+
page, old_xref, old_rot, old_mediabox = page_rotation_set0(page)
|
|
2188
|
+
else:
|
|
2189
|
+
old_xref, old_rot, old_mediabox = None, None, None
|
|
2190
|
+
|
|
2002
2191
|
if snap_x_tolerance is None:
|
|
2003
2192
|
snap_x_tolerance = UNSET
|
|
2004
2193
|
if snap_y_tolerance is None:
|
|
@@ -2011,6 +2200,10 @@ def find_tables(
|
|
|
2011
2200
|
intersection_x_tolerance = UNSET
|
|
2012
2201
|
if intersection_y_tolerance is None:
|
|
2013
2202
|
intersection_y_tolerance = UNSET
|
|
2203
|
+
if strategy is not None:
|
|
2204
|
+
vertical_strategy = strategy
|
|
2205
|
+
horizontal_strategy = strategy
|
|
2206
|
+
|
|
2014
2207
|
settings = {
|
|
2015
2208
|
"vertical_strategy": vertical_strategy,
|
|
2016
2209
|
"horizontal_strategy": horizontal_strategy,
|
|
@@ -2034,7 +2227,12 @@ def find_tables(
|
|
|
2034
2227
|
}
|
|
2035
2228
|
tset = TableSettings.resolve(settings=settings)
|
|
2036
2229
|
page.table_settings = tset
|
|
2230
|
+
|
|
2037
2231
|
make_chars(page, clip=clip) # create character list of page
|
|
2038
2232
|
make_edges(page, clip=clip, tset=tset) # create lines and curves
|
|
2039
2233
|
tables = TableFinder(page, settings=tset)
|
|
2234
|
+
|
|
2235
|
+
TOOLS.set_small_glyph_heights(old_small)
|
|
2236
|
+
if old_xref is not None:
|
|
2237
|
+
page = page_rotation_reset(page, old_xref, old_rot, old_mediabox)
|
|
2040
2238
|
return tables
|
|
@@ -6344,7 +6344,7 @@ def get_oc_items(self) -> list:
|
|
|
6344
6344
|
cmd = item[0]
|
|
6345
6345
|
rest = item[1:]
|
|
6346
6346
|
if cmd == "re":
|
|
6347
|
-
item = ("re", Rect(rest[0]), rest[1])
|
|
6347
|
+
item = ("re", Rect(rest[0]).normalize(), rest[1])
|
|
6348
6348
|
elif cmd == "qu":
|
|
6349
6349
|
item = ("qu", Quad(rest[0]))
|
|
6350
6350
|
else:
|
|
@@ -6479,7 +6479,7 @@ def get_oc_items(self) -> list:
|
|
|
6479
6479
|
cmd = item[0]
|
|
6480
6480
|
rest = item[1:]
|
|
6481
6481
|
if cmd == "re":
|
|
6482
|
-
item = ("re", Rect(rest[0]), rest[1])
|
|
6482
|
+
item = ("re", Rect(rest[0]).normalize(), rest[1])
|
|
6483
6483
|
elif cmd == "qu":
|
|
6484
6484
|
item = ("qu", Quad(rest[0]))
|
|
6485
6485
|
else:
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
%pythoncode %{
|
|
2
|
+
VersionFitz = "1.23.9" # MuPDF version.
|
|
3
|
+
VersionBind = "1.23.14" # PyMuPDF version.
|
|
4
|
+
VersionDate = "2024-01-15 00:00:01"
|
|
5
|
+
version = (VersionBind, VersionFitz, "20240115000001")
|
|
6
|
+
pymupdf_version_tuple = tuple( [int(i) for i in VersionFitz.split('.')])
|
|
7
|
+
%}
|
|
Binary file
|