pysfi 0.1.5__tar.gz → 0.1.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pysfi-0.1.5 → pysfi-0.1.6}/PKG-INFO +5 -1
- {pysfi-0.1.5 → pysfi-0.1.6}/pyproject.toml +132 -117
- {pysfi-0.1.5 → pysfi-0.1.6}/sfi/__init__.py +1 -1
- {pysfi-0.1.5 → pysfi-0.1.6}/sfi/bumpversion/__init__.py +1 -1
- pysfi-0.1.6/sfi/pdfsplit/pdfsplit.py +173 -0
- pysfi-0.1.6/sfi/pdfsplit/tests/test_pdfsplit.py +333 -0
- pysfi-0.1.6/sfi/pyloadergen/__init__.py +0 -0
- pysfi-0.1.6/sfi/pyloadergen/tests/__init__.py +0 -0
- {pysfi-0.1.5 → pysfi-0.1.6}/.gitignore +0 -0
- {pysfi-0.1.5 → pysfi-0.1.6}/README.md +0 -0
- {pysfi-0.1.5 → pysfi-0.1.6}/sfi/alarmclock/__init__.py +0 -0
- {pysfi-0.1.5 → pysfi-0.1.6}/sfi/alarmclock/alarmclock.py +0 -0
- {pysfi-0.1.5 → pysfi-0.1.6}/sfi/bumpversion/README.md +0 -0
- {pysfi-0.1.5 → pysfi-0.1.6}/sfi/bumpversion/bumpversion.py +0 -0
- {pysfi-0.1.5 → pysfi-0.1.6}/sfi/bumpversion/tests/__init__.py +0 -0
- {pysfi-0.1.5 → pysfi-0.1.6}/sfi/bumpversion/tests/test_bumpversion.py +0 -0
- {pysfi-0.1.5 → pysfi-0.1.6}/sfi/embedinstall/embedinstall.py +0 -0
- {pysfi-0.1.5 → pysfi-0.1.6}/sfi/filedate/README.md +0 -0
- {pysfi-0.1.5 → pysfi-0.1.6}/sfi/filedate/__init__.py +0 -0
- {pysfi-0.1.5 → pysfi-0.1.6}/sfi/filedate/filedate.py +0 -0
- {pysfi-0.1.5 → pysfi-0.1.6}/sfi/makepython/__init__.py +0 -0
- {pysfi-0.1.5 → pysfi-0.1.6}/sfi/makepython/makepython.py +0 -0
- {pysfi-0.1.5/sfi/pyloadergen → pysfi-0.1.6/sfi/pdfsplit}/tests/__init__.py +0 -0
- {pysfi-0.1.5 → pysfi-0.1.6}/sfi/projectparse/projectparse.py +0 -0
- {pysfi-0.1.5 → pysfi-0.1.6}/sfi/pyloadergen/pyloadergen.py +0 -0
- {pysfi-0.1.5 → pysfi-0.1.6}/sfi/pyloadergen/tests/test_pyloadergen.py +0 -0
- {pysfi-0.1.5 → pysfi-0.1.6}/sfi/pypacker/fspacker.py +0 -0
- {pysfi-0.1.5 → pysfi-0.1.6}/sfi/taskkill/taskkill.py +0 -0
- {pysfi-0.1.5 → pysfi-0.1.6}/sfi/which/which.py +0 -0
|
@@ -1,9 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pysfi
|
|
3
|
-
Version: 0.1.5
|
|
3
|
+
Version: 0.1.6
|
|
4
4
|
Summary: Single File commands for Interactive python.
|
|
5
5
|
Requires-Python: >=3.8
|
|
6
6
|
Requires-Dist: tomli>=2.4.0; python_version < '3.11'
|
|
7
|
+
Provides-Extra: all
|
|
8
|
+
Requires-Dist: pymupdf>=1.24.11; extra == 'all'
|
|
9
|
+
Provides-Extra: office
|
|
10
|
+
Requires-Dist: pymupdf>=1.24.11; extra == 'office'
|
|
7
11
|
Description-Content-Type: text/markdown
|
|
8
12
|
|
|
9
13
|
# pysfi
|
|
@@ -1,117 +1,132 @@
|
|
|
1
|
-
[build-system]
|
|
2
|
-
build-backend = "hatchling.build"
|
|
3
|
-
requires = ["hatchling"]
|
|
4
|
-
|
|
5
|
-
[project]
|
|
6
|
-
dependencies = ["tomli>=2.4.0; python_version<'3.11'"]
|
|
7
|
-
description = "Single File commands for Interactive python."
|
|
8
|
-
name = "pysfi"
|
|
9
|
-
readme = "README.md"
|
|
10
|
-
requires-python = ">=3.8"
|
|
11
|
-
version = "0.1.5"
|
|
12
|
-
|
|
13
|
-
[project.scripts]
|
|
14
|
-
alarmclk = "sfi.alarmclock.alarmclock:main"
|
|
15
|
-
bumpversion = "sfi.bumpversion.bumpversion:main"
|
|
16
|
-
embedinstall = "sfi.embedinstall.embedinstall:main"
|
|
17
|
-
filedate = "sfi.filedate.filedate:main"
|
|
18
|
-
mkp = "sfi.makepython.makepython:main"
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
".
|
|
50
|
-
".
|
|
51
|
-
".
|
|
52
|
-
".
|
|
53
|
-
".
|
|
54
|
-
".
|
|
55
|
-
".
|
|
56
|
-
".
|
|
57
|
-
".
|
|
58
|
-
"
|
|
59
|
-
"
|
|
60
|
-
"
|
|
61
|
-
"
|
|
62
|
-
"
|
|
63
|
-
"
|
|
64
|
-
"
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
]
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
"
|
|
77
|
-
"
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
"
|
|
81
|
-
"
|
|
82
|
-
"
|
|
83
|
-
"
|
|
84
|
-
"
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
[
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
[
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
"sfi/
|
|
102
|
-
"sfi/
|
|
103
|
-
"sfi/
|
|
104
|
-
"sfi/
|
|
105
|
-
"sfi/
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
"
|
|
111
|
-
"
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
"
|
|
117
|
-
]
|
|
1
|
+
[build-system]
|
|
2
|
+
build-backend = "hatchling.build"
|
|
3
|
+
requires = ["hatchling"]
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
dependencies = ["tomli>=2.4.0; python_version<'3.11'"]
|
|
7
|
+
description = "Single File commands for Interactive python."
|
|
8
|
+
name = "pysfi"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
version = "0.1.6"
|
|
12
|
+
|
|
13
|
+
[project.scripts]
|
|
14
|
+
alarmclk = "sfi.alarmclock.alarmclock:main"
|
|
15
|
+
bumpversion = "sfi.bumpversion.bumpversion:main"
|
|
16
|
+
embedinstall = "sfi.embedinstall.embedinstall:main"
|
|
17
|
+
filedate = "sfi.filedate.filedate:main"
|
|
18
|
+
mkp = "sfi.makepython.makepython:main"
|
|
19
|
+
pdfsplit = "sfi.pdfsplit.pdfsplit:main"
|
|
20
|
+
projectparse = "sfi.projectparse.projectparse:main"
|
|
21
|
+
pyloadergen = "sfi.pyloadergen.pyloadergen:main"
|
|
22
|
+
pypacker = "sfi.pypacker.pypacker:main"
|
|
23
|
+
taskk = "sfi.taskkill.taskkill:main"
|
|
24
|
+
wch = "sfi.which.which:main"
|
|
25
|
+
|
|
26
|
+
[project.optional-dependencies]
|
|
27
|
+
all = ["pysfi[office]"]
|
|
28
|
+
office = ["pymupdf>=1.24.11"]
|
|
29
|
+
|
|
30
|
+
[tool.hatch.build.targets.wheel]
|
|
31
|
+
exclude = [
|
|
32
|
+
"sfi/*/README.md",
|
|
33
|
+
"sfi/*/dist",
|
|
34
|
+
"sfi/*/pyproject.toml",
|
|
35
|
+
"sfi/*/tests",
|
|
36
|
+
]
|
|
37
|
+
packages = ["sfi"]
|
|
38
|
+
|
|
39
|
+
# Only include necessary source files to minimize package size
|
|
40
|
+
[tool.hatch.build]
|
|
41
|
+
include = ["README.md", "sfi/**/*.py", "sfi/pyproject.toml"]
|
|
42
|
+
|
|
43
|
+
[tool.ruff]
|
|
44
|
+
line-length = 120
|
|
45
|
+
target-version = "py38"
|
|
46
|
+
|
|
47
|
+
# Exclude files and directories
|
|
48
|
+
exclude = [
|
|
49
|
+
".bzr",
|
|
50
|
+
".direnv",
|
|
51
|
+
".eggs",
|
|
52
|
+
".git",
|
|
53
|
+
".git-rewrite",
|
|
54
|
+
".hg",
|
|
55
|
+
".mypy_cache",
|
|
56
|
+
".nox",
|
|
57
|
+
".pants.d",
|
|
58
|
+
".pytype",
|
|
59
|
+
".ruff_cache",
|
|
60
|
+
".svn",
|
|
61
|
+
".tox",
|
|
62
|
+
".venv",
|
|
63
|
+
"__pypackages__",
|
|
64
|
+
"_build",
|
|
65
|
+
"buck-out",
|
|
66
|
+
"build",
|
|
67
|
+
"dist",
|
|
68
|
+
"node_modules",
|
|
69
|
+
"venv",
|
|
70
|
+
]
|
|
71
|
+
|
|
72
|
+
[tool.ruff.lint]
|
|
73
|
+
dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
|
|
74
|
+
fixable = ["ALL"]
|
|
75
|
+
ignore = [
|
|
76
|
+
"B008", # Do not perform function calls in function arguments
|
|
77
|
+
"E501", # Line too long (handled by formatter)
|
|
78
|
+
]
|
|
79
|
+
select = [
|
|
80
|
+
"B", # flake8-bugbear
|
|
81
|
+
"C4", # flake8-comprehensions
|
|
82
|
+
"E", # pycodestyle errors
|
|
83
|
+
"F", # Pyflakes
|
|
84
|
+
"I", # isort
|
|
85
|
+
"N", # pep8-naming
|
|
86
|
+
"RUF", # Ruff-specific rules
|
|
87
|
+
"SIM", # flake8-simplify
|
|
88
|
+
"UP", # pyupgrade
|
|
89
|
+
"W", # pycodestyle warnings
|
|
90
|
+
]
|
|
91
|
+
unfixable = []
|
|
92
|
+
|
|
93
|
+
[tool.ruff.lint.isort]
|
|
94
|
+
known-first-party = ["sfi"]
|
|
95
|
+
|
|
96
|
+
[tool.ruff.lint.per-file-ignores]
|
|
97
|
+
"__init__.py" = ["F401"] # Allow unused imports
|
|
98
|
+
|
|
99
|
+
[tool.uv.workspace]
|
|
100
|
+
members = [
|
|
101
|
+
"sfi/alarmclock",
|
|
102
|
+
"sfi/bumpversion",
|
|
103
|
+
"sfi/embedinstall",
|
|
104
|
+
"sfi/filedate",
|
|
105
|
+
"sfi/makepython",
|
|
106
|
+
"sfi/pdfsplit",
|
|
107
|
+
"sfi/projectparse",
|
|
108
|
+
"sfi/pyloadergen",
|
|
109
|
+
"sfi/pypacker",
|
|
110
|
+
"sfi/taskkill",
|
|
111
|
+
"sfi/which",
|
|
112
|
+
]
|
|
113
|
+
|
|
114
|
+
[dependency-groups]
|
|
115
|
+
dev = [
|
|
116
|
+
"hatch>=1.14.2",
|
|
117
|
+
"pysfi[all]",
|
|
118
|
+
"pyside2>=5.15.2.1",
|
|
119
|
+
"pytest-cov>=5.0.0",
|
|
120
|
+
"pytest>=8.3.5",
|
|
121
|
+
"qdarkstyle>=3.2.3",
|
|
122
|
+
"ruff>=0.14.11",
|
|
123
|
+
"tomli>=2.4.0",
|
|
124
|
+
]
|
|
125
|
+
|
|
126
|
+
[tool.pytest.ini_options]
|
|
127
|
+
addopts = "-v --tb=short"
|
|
128
|
+
python_classes = ["Test*"]
|
|
129
|
+
python_files = ["test_*.py"]
|
|
130
|
+
python_functions = ["test_*"]
|
|
131
|
+
pythonpath = ["."]
|
|
132
|
+
testpaths = ["sfi"]
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import fitz
|
|
8
|
+
|
|
9
|
+
logging.basicConfig(level=logging.INFO, format="%(message)s")
|
|
10
|
+
cwd = Path.cwd()
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def parse_page_ranges(range_str: str, total_pages: int) -> list[int]:
    """Parse a page range string and return the listed page numbers (1-indexed).

    The string is a comma-separated list of tokens. Each token is either a
    single page (``"7"``) or an inclusive range (``"4-10"``). A range may omit
    its start (``"-5"`` means ``1-5``) or its end (``"25-"`` means page 25 up to
    ``total_pages``). Empty tokens are ignored.

    Pages are returned in the order written, without deduplication and without
    checking them against ``total_pages``; callers filter as needed. A range
    whose start exceeds its end contributes no pages.

    Args:
        range_str: Range specification, e.g. ``"1,2,4-10,15-20,25-"``.
        total_pages: Page count used as the end of open-ended ranges.

    Returns:
        The 1-indexed page numbers described by ``range_str``.

    Raises:
        ValueError: If a token is not a number or a ``start-end`` pair.
    """
    pages: list[int] = []
    for raw_part in range_str.split(","):
        part = raw_part.strip()
        if not part:
            continue
        try:
            if "-" in part:
                # partition() splits on the first dash only, so a token such as
                # "1-2-3" fails in int() below instead of during unpacking.
                start_text, _, end_text = part.partition("-")
                start = int(start_text) if start_text else 1
                end = int(end_text) if end_text else total_pages
                pages.extend(range(start, end + 1))
            else:
                pages.append(int(part))
        except ValueError:
            # Re-raise with the offending token so the user can see what to fix.
            raise ValueError(
                f"Invalid page range {part!r}: expected a page number or 'start-end'"
            ) from None
    return pages
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def split_by_number(input_file: Path, output_file: Path, number: int):
    """Split a PDF into ``number`` consecutive parts of near-equal length.

    Each part is written next to ``output_file`` as ``<stem>_part<N><suffix>``
    with N starting at 1. When the page count is not divisible by ``number``,
    the first ``total_pages % number`` parts receive one extra page. If
    ``number`` exceeds the page count, only as many single-page parts as there
    are pages are written and the remaining parts are skipped.

    Args:
        input_file: PDF to read.
        output_file: Template path; its parent, stem and suffix name the parts.
        number: How many parts to produce; must be at least 1.
    """
    if number < 1:
        # Zero would divide by zero below and a negative count would silently
        # produce nothing, so reject both up front.
        logger.error(f"Number of splits must be at least 1, got {number}")
        return

    doc = fitz.open(input_file)
    try:
        total_pages = doc.page_count
        base_pages, remainder = divmod(total_pages, number)

        logger.debug(
            f"Total pages: {total_pages}, Splitting into {number} parts, {base_pages} base pages per part, {remainder} extra pages"
        )

        current_page = 0
        for i in range(number):
            if current_page >= total_pages:
                logger.debug(f"Skipping part {i + 1}: no more pages remaining")
                continue

            # First 'remainder' parts get one extra page
            pages_in_this_part = base_pages + (1 if i < remainder else 0)
            end_page = min(current_page + pages_in_this_part, total_pages)

            part_file = output_file.parent / f"{output_file.stem}_part{i + 1}{output_file.suffix}"
            part_doc = fitz.open()
            try:
                # Copy the whole run in one call; to_page is inclusive.
                part_doc.insert_pdf(doc, from_page=current_page, to_page=end_page - 1)
                part_doc.save(part_file)
            finally:
                part_doc.close()
            logger.info(f"Created part {i + 1}: {part_file} (pages {current_page + 1}-{end_page})")

            current_page = end_page
    finally:
        # Release the source document even if writing a part fails.
        doc.close()
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def split_by_size(input_file: Path, output_file: Path, size: int):
    """Split a PDF into consecutive parts of at most ``size`` pages each.

    Each part is written next to ``output_file`` as ``<stem>_part<N><suffix>``
    with N starting at 1. The last part holds the leftover pages when the page
    count is not a multiple of ``size``.

    Args:
        input_file: PDF to read.
        output_file: Template path; its parent, stem and suffix name the parts.
        size: Maximum number of pages per part; must be at least 1.
    """
    if size < 1:
        # A non-positive size would never advance through the document.
        logger.error(f"Size of each split must be at least 1 page, got {size}")
        return

    doc = fitz.open(input_file)
    try:
        total_pages = doc.page_count

        logger.debug(f"Total pages: {total_pages}, Splitting with {size} pages per part")

        # Walk the document in steps of ``size``; enumerate supplies the 1-based part number.
        for part, start_page in enumerate(range(0, total_pages, size), start=1):
            end_page = min(start_page + size, total_pages)
            part_file = output_file.parent / f"{output_file.stem}_part{part}{output_file.suffix}"
            part_doc = fitz.open()
            try:
                # Copy the whole run in one call; to_page is inclusive.
                part_doc.insert_pdf(doc, from_page=start_page, to_page=end_page - 1)
                part_doc.save(part_file)
            finally:
                part_doc.close()
            logger.info(f"Created part {part}: {part_file} (pages {start_page + 1}-{end_page})")
    finally:
        # Release the source document even if writing a part fails.
        doc.close()
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def split_by_range(input_file: Path, output_file: Path, range_str: str):
    """Extract the pages named by ``range_str`` into a single new PDF.

    The range string is parsed by ``parse_page_ranges``. Pages outside the
    document are dropped, duplicates are removed, and the remaining pages are
    written in ascending page order regardless of the order they were listed
    in. If nothing valid remains, an error is logged and no file is written.

    Args:
        input_file: PDF to read.
        output_file: Path of the PDF to create.
        range_str: 1-indexed page specification, e.g. ``"1,2,4-6,8-"``.
    """
    doc = fitz.open(input_file)
    try:
        total_pages = doc.page_count

        requested = parse_page_ranges(range_str, total_pages)
        # Keep in-document pages only, deduplicate, convert to 0-indexed and
        # sort ascending (the listing order is intentionally not preserved).
        pages = sorted({p - 1 for p in requested if 1 <= p <= total_pages})

        if not pages:
            logger.error("No valid pages found in the specified range")
            return

        logger.debug(f"Extracting pages: {[p + 1 for p in pages]}")

        new_doc = fitz.open()
        try:
            for page_num in pages:
                new_doc.insert_pdf(doc, from_page=page_num, to_page=page_num)
            new_doc.save(output_file)
        finally:
            new_doc.close()
    finally:
        # Release the source document on every path, including parse errors.
        doc.close()
    logger.info(f"Created output file: {output_file} ({len(pages)} pages)")
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def main():
    """Command-line entry point: parse arguments and run the selected split mode.

    Exactly one of ``-n/--number``, ``-s/--size`` or ``-r/--range`` must be
    given. For ``-n`` and ``-s`` the output path is optional and defaults to
    ``<output-dir>/<input stem>_split.pdf``; for ``-r`` it is required.
    Validation failures are logged and the function returns without splitting.
    """
    parser = argparse.ArgumentParser(description="Split PDF files")
    parser.add_argument("input", help="Input PDF file")
    parser.add_argument("output", nargs="?", help="Output PDF file (optional for -n and -s modes)")
    parser.add_argument("-o", "--output-dir", default=str(cwd), help="Output directory (default: current directory)")
    # NOTE(review): --output-format is parsed but not applied anywhere in this function.
    parser.add_argument("-f", "--output-format", help="Output file format pattern, e.g., 'split_{part:02d}.pdf'")
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")

    # Split by number, size, or range
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-n", "--number", type=int, help="Number of splits")
    group.add_argument("-s", "--size", type=int, default=1, help="Size of each split in pages")
    group.add_argument("-r", "--range", type=str, help="Range of pages to extract, e.g., '1,2,4-10,15-20,25-'")

    args = parser.parse_args()

    if args.verbose:
        logger.setLevel(logging.DEBUG)

    output_dir = Path(args.output_dir)
    if not output_dir.is_dir():
        logger.error(f"Output directory {args.output_dir} does not exist, please check the path.")
        return

    input_file = Path(args.input)
    if not input_file.is_file():
        logger.error(f"Input file {args.input} does not exist, please check the path.")
        return

    # For -n and -s modes, output is optional and defaults to base name with suffix
    # For -r mode, output is required
    if args.range is not None and not args.output:
        logger.error("Output file is required for -r/--range mode")
        return

    if args.number is not None and args.number < 1:
        logger.error(f"Number of splits must be at least 1, got {args.number}")
        return

    if args.size < 1:
        logger.error(f"Size of each split must be at least 1 page, got {args.size}")
        return

    if args.output:
        output_file = Path(args.output)
    else:
        output_file = output_dir / (input_file.stem + "_split.pdf")

    logger.info(f"Start splitting {input_file}")
    # --size has a default of 1, so args.size is always set and truthy. It must
    # be the fallback branch, otherwise it shadows -r/--range.
    if args.number is not None:
        split_by_number(input_file, output_file, args.number)
    elif args.range is not None:
        split_by_range(input_file, output_file, args.range)
    else:
        split_by_size(input_file, output_file, args.size)
|
|
@@ -0,0 +1,333 @@
|
|
|
1
|
+
"""Tests for PDF split functionality."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import shutil
|
|
5
|
+
import sys
|
|
6
|
+
import tempfile
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
import fitz
|
|
10
|
+
import pytest
|
|
11
|
+
|
|
12
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
13
|
+
|
|
14
|
+
from sfi.pdfsplit.pdfsplit import parse_page_ranges, split_by_number, split_by_range, split_by_size
|
|
15
|
+
|
|
16
|
+
logging.basicConfig(level=logging.DEBUG)
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@pytest.fixture
def temp_dir():
    """Yield a fresh scratch directory and remove it after the test."""
    scratch = tempfile.mkdtemp()
    workdir = Path(scratch)
    yield workdir
    # Teardown: best-effort removal, errors while deleting are ignored.
    shutil.rmtree(workdir, ignore_errors=True)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@pytest.fixture
def sample_pdf_10(temp_dir):
    """Write a ten-page PDF into ``temp_dir`` and return its path."""
    pdf_path = temp_dir / "sample_10.pdf"
    document = fitz.open()
    for page_number in range(1, 11):
        page = document.new_page()  # pyright: ignore[reportAttributeAccessIssue]
        anchor = fitz.Point(50, 72)
        # Label every page with its 1-based number.
        page.insert_text(anchor, f"This is page {page_number}", fontsize=12)
    document.save(pdf_path)
    document.close()
    logger.debug(f"Created test PDF: {pdf_path} with 10 pages")
    return pdf_path
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@pytest.fixture
def sample_pdf_5(temp_dir):
    """Write a five-page PDF into ``temp_dir`` and return its path."""
    pdf_path = temp_dir / "sample_5.pdf"
    document = fitz.open()
    for page_number in range(1, 6):
        page = document.new_page()  # pyright: ignore[reportAttributeAccessIssue]
        anchor = fitz.Point(50, 72)
        # Label every page with its 1-based number.
        page.insert_text(anchor, f"This is page {page_number}", fontsize=12)
    document.save(pdf_path)
    document.close()
    logger.debug(f"Created test PDF: {pdf_path} with 5 pages")
    return pdf_path
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def get_page_count(pdf_path: Path) -> int:
    """Open ``pdf_path``, read its page count, close it and return the count."""
    document = fitz.open(pdf_path)
    total = document.page_count
    document.close()
    return total
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def test_parse_page_ranges():
    """Check ``parse_page_ranges`` against a table of specs for a 10-page document."""
    cases = [
        ("1,2,3", [1, 2, 3]),  # single pages
        ("1-5", [1, 2, 3, 4, 5]),  # closed range
        ("1,3,5-7", [1, 3, 5, 6, 7]),  # mixed pages and range
        ("5-", [5, 6, 7, 8, 9, 10]),  # open end runs to the last page
        ("-5", [1, 2, 3, 4, 5]),  # open start begins at page 1
        ("1,2,20", [1, 2, 20]),  # the parser returns out-of-range pages unfiltered
    ]
    for spec, expected in cases:
        assert parse_page_ranges(spec, 10) == expected
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def test_split_by_number_3_parts(sample_pdf_10, temp_dir):
    """Ten pages split three ways should give parts of 4, 3 and 3 pages."""
    split_by_number(sample_pdf_10, temp_dir / "output.pdf", 3)

    # Exactly three part files are expected.
    part_files = sorted(temp_dir.glob("output_part*.pdf"))
    assert len(part_files) == 3

    # The single leftover page goes to the first part.
    page_counts = list(map(get_page_count, part_files))
    assert page_counts == [4, 3, 3]

    logger.info(f"Split into 3 parts: {page_counts}")
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def test_split_by_number_equal(sample_pdf_10, temp_dir):
    """Ten pages divided into five parts should yield two pages per part."""
    target = temp_dir / "output.pdf"
    split_by_number(sample_pdf_10, target, 5)

    part_files = sorted(temp_dir.glob("output_part*.pdf"))
    assert len(part_files) == 5

    page_counts = [get_page_count(path) for path in part_files]
    # Even division: no part may deviate from two pages.
    assert not any(count != 2 for count in page_counts)

    logger.info(f"Split into 5 equal parts: {page_counts}")
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def test_split_by_number_more_parts_than_pages(sample_pdf_5, temp_dir):
    """Asking for ten parts of a five-page PDF should produce five one-page parts."""
    target = temp_dir / "output.pdf"
    split_by_number(sample_pdf_5, target, 10)

    # Only as many parts as there are pages get written.
    part_files = sorted(temp_dir.glob("output_part*.pdf"))
    assert len(part_files) == 5

    page_counts = [get_page_count(path) for path in part_files]
    assert not any(count != 1 for count in page_counts)

    logger.info(f"Split 5 pages into 10 parts: {page_counts}")
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def test_split_by_size_3_pages(sample_pdf_10, temp_dir):
    """Ten pages at three pages per part should give parts of 3, 3, 3 and 1."""
    split_by_size(sample_pdf_10, temp_dir / "output.pdf", 3)

    part_files = sorted(temp_dir.glob("output_part*.pdf"))
    assert len(part_files) == 4

    # Three full parts followed by the one-page remainder.
    page_counts = list(map(get_page_count, part_files))
    assert page_counts == [3, 3, 3, 1]

    logger.info(f"Split by size 3: {page_counts}")
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def test_split_by_size_1_page(sample_pdf_10, temp_dir):
    """One page per part should turn ten pages into ten single-page files."""
    target = temp_dir / "output.pdf"
    split_by_size(sample_pdf_10, target, 1)

    part_files = sorted(temp_dir.glob("output_part*.pdf"))
    assert len(part_files) == 10

    page_counts = [get_page_count(path) for path in part_files]
    # No part may hold anything other than exactly one page.
    assert not any(count != 1 for count in page_counts)

    logger.info(f"Split by size 1: {page_counts}")
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def test_split_by_size_exact_division(sample_pdf_10, temp_dir):
    """Five pages per part divides ten pages exactly into two parts."""
    split_by_size(sample_pdf_10, temp_dir / "output.pdf", 5)

    part_files = sorted(temp_dir.glob("output_part*.pdf"))
    assert len(part_files) == 2

    # No remainder part: both files hold five pages.
    page_counts = list(map(get_page_count, part_files))
    assert page_counts == [5, 5]

    logger.info(f"Split by size 5 (exact): {page_counts}")
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def test_split_by_range_single_pages(sample_pdf_10, temp_dir):
    """Extracting five individually listed pages should give a five-page file."""
    extracted = temp_dir / "extracted.pdf"
    split_by_range(sample_pdf_10, extracted, "1,3,5,7,9")

    assert extracted.exists()
    # Only the page count is checked here, not the page content.
    expected_pages = 5
    assert get_page_count(extracted) == expected_pages

    logger.info("Extracted 5 pages: 1,3,5,7,9")
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def test_split_by_range_with_dash(sample_pdf_10, temp_dir):
    """Several closed ranges should be combined into one output file."""
    extracted = temp_dir / "extracted.pdf"
    split_by_range(sample_pdf_10, extracted, "1-3,5-7,9-10")

    assert extracted.exists()
    # Selected pages: 1, 2, 3, 5, 6, 7, 9, 10.
    expected_pages = 8
    assert get_page_count(extracted) == expected_pages

    logger.info("Extracted 8 pages with ranges: 1-3,5-7,9-10")
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def test_split_by_range_open_end(sample_pdf_10, temp_dir):
    """An open-ended range should run from the given page to the last page."""
    extracted = temp_dir / "extracted.pdf"
    split_by_range(sample_pdf_10, extracted, "5-")

    assert extracted.exists()
    # Selected pages: 5, 6, 7, 8, 9, 10.
    expected_pages = 6
    assert get_page_count(extracted) == expected_pages

    logger.info("Extracted pages 5- (6 pages)")
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def test_split_by_range_open_start(sample_pdf_10, temp_dir):
    """A range with no start should begin at the first page."""
    extracted = temp_dir / "extracted.pdf"
    split_by_range(sample_pdf_10, extracted, "-3")

    assert extracted.exists()
    # Selected pages: 1, 2, 3.
    expected_pages = 3
    assert get_page_count(extracted) == expected_pages

    logger.info("Extracted pages -3 (3 pages)")
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def test_split_by_range_complex(sample_pdf_10, temp_dir):
    """Single pages, a closed range and an open-ended range can be mixed."""
    extracted = temp_dir / "extracted.pdf"
    split_by_range(sample_pdf_10, extracted, "1,2,4-6,8-")

    assert extracted.exists()
    # Selected pages: 1, 2, 4, 5, 6, 8, 9, 10.
    expected_pages = 8
    assert get_page_count(extracted) == expected_pages

    logger.info("Extracted complex range: 1,2,4-6,8-")
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def test_split_by_range_out_of_range(sample_pdf_5, temp_dir):
    """Pages beyond the end of the document should be dropped from the output."""
    extracted = temp_dir / "extracted.pdf"
    split_by_range(sample_pdf_5, extracted, "1,3,10,20")

    assert extracted.exists()
    # Of the requested pages only 1 and 3 exist in a five-page PDF.
    expected_pages = 2
    assert get_page_count(extracted) == expected_pages

    logger.info("Extracted pages with some out of range: 1,3,10,20")
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def test_split_by_range_all_pages(sample_pdf_10, temp_dir):
    """A range covering the whole document should copy every page."""
    extracted = temp_dir / "extracted.pdf"
    split_by_range(sample_pdf_10, extracted, "1-10")

    assert extracted.exists()
    expected_pages = 10
    assert get_page_count(extracted) == expected_pages

    logger.info("Extracted all 10 pages")
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def test_split_by_range_duplicate_pages(sample_pdf_10, temp_dir):
    """Pages listed more than once should appear only once in the output."""
    extracted = temp_dir / "extracted.pdf"
    split_by_range(sample_pdf_10, extracted, "1,2,1,2,3-5,3-5")

    assert extracted.exists()
    # After deduplication the selection is pages 1, 2, 3, 4, 5.
    expected_pages = 5
    assert get_page_count(extracted) == expected_pages

    logger.info("Extracted with duplicates (should deduplicate): 1,2,1,2,3-5,3-5")
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def test_output_file_naming(sample_pdf_10, temp_dir):
    """Part files should be named ``<stem>_part<N><suffix>`` with N starting at 1."""
    split_by_number(sample_pdf_10, temp_dir / "test_output.pdf", 3)

    # Three parts are expected for a three-way split.
    part_files = sorted(temp_dir.glob("test_output_part*.pdf"))
    assert len(part_files) == 3

    # Compare the bare file names against the expected sequence.
    expected_names = [f"test_output_part{index}.pdf" for index in (1, 2, 3)]
    filenames = [path.name for path in part_files]
    assert filenames == expected_names

    logger.info(f"File naming test passed: {filenames}")
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def test_large_pdf(temp_dir):
    """A 50-page PDF split at ten pages per part should give five full parts."""
    # Build the 50-page source document.
    pdf_path = temp_dir / "large_50.pdf"
    document = fitz.open()
    for page_number in range(1, 51):
        page = document.new_page()  # pyright: ignore[reportAttributeAccessIssue]
        anchor = fitz.Point(50, 72)
        page.insert_text(anchor, f"This is page {page_number}", fontsize=12)
    document.save(pdf_path)
    document.close()

    # Split into chunks of ten pages.
    split_by_size(pdf_path, temp_dir / "large_output.pdf", 10)

    part_files = sorted(temp_dir.glob("large_output_part*.pdf"))
    assert len(part_files) == 5

    # Every part must hold exactly ten pages.
    page_counts = [get_page_count(path) for path in part_files]
    assert not any(count != 10 for count in page_counts)

    logger.info("Large PDF (50 pages) split into 5 parts of 10 pages each")
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def test_edge_case_single_page_pdf(temp_dir):
    """Splitting a one-page PDF into two parts should write a single one-page part."""
    pdf_path = temp_dir / "single.pdf"
    document = fitz.open()
    page = document.new_page()  # pyright: ignore[reportAttributeAccessIssue]
    anchor = fitz.Point(50, 72)
    page.insert_text(anchor, "Single page", fontsize=12)
    document.save(pdf_path)
    document.close()

    # Request more parts than there are pages.
    split_by_number(pdf_path, temp_dir / "single_output.pdf", 2)

    part_files = sorted(temp_dir.glob("single_output_part*.pdf"))
    assert len(part_files) == 1
    only_part = part_files[0]
    assert get_page_count(only_part) == 1

    logger.info("Single-page PDF handled correctly")
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
# Allow running this test module directly as a script; pytest is invoked on
# this file only, with verbose reporting (-v) and output capture disabled (-s).
if __name__ == "__main__":
    pytest.main([__file__, "-v", "-s"])
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|