pdf-scount 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdf-scount
3
+ Version: 0.1.0
4
+ Summary: A CLI tool to search keywords in PDFs and export matching pages as PNG images
5
+ Author-email: pioet <1599023541@qq.com>
6
+ License-Expression: MIT
7
+ Keywords: cli,extract,pdf,png,search
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Environment :: Console
10
+ Classifier: Intended Audience :: End Users/Desktop
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Classifier: Topic :: Utilities
15
+ Requires-Python: >=3.13
16
+ Requires-Dist: pymupdf>=1.26.7
17
+ Requires-Dist: typer>=0.21.1
@@ -0,0 +1,5 @@
1
+ pdf_scount.py,sha256=ntUX9D-azjQiCKhEu8jYHdRMHpmaq6R32r-Lci_WBp4,2613
2
+ pdf_scount-0.1.0.dist-info/METADATA,sha256=RZILGhoAJLz4nQmsf8L9FCi42d_s1z6QOevZaZdSkIQ,637
3
+ pdf_scount-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
4
+ pdf_scount-0.1.0.dist-info/entry_points.txt,sha256=z_hdqmQ6c_G4DWwWvsd-7VJCXdMpx0XWlgspMjWYjW8,44
5
+ pdf_scount-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ pdfscout = pdf_scount:app
pdf_scount.py ADDED
@@ -0,0 +1,74 @@
1
+ import fitz # PyMuPDF
2
+ import typer
3
+ from typing import Optional
4
+ from pathlib import Path
5
+
6
# Module-level Typer application. The packaged console script
# (`pdfscout = pdf_scount:app`) and the `__main__` guard both invoke this
# object. The help string is user-facing CLI text and is kept verbatim.
app = typer.Typer(
    help="PdfScout: 一个根据关键词快速定位并截取 PDF 页面的工具。",
)
7
+
8
def find_first_page_with_keyword(pdf_path: Path, keyword: str) -> Optional[int]:
    """Return the zero-based index of the first page containing *keyword*.

    Pages are scanned in document order and the scan stops at the first hit.
    The match is a caseless substring test against each page's extracted
    plain text.

    Args:
        pdf_path: Path of the PDF file to search.
        keyword: Text to look for.

    Returns:
        The zero-based index of the first matching page, or ``None`` when no
        page contains the keyword.
    """
    # casefold() is the Unicode-aware form of caseless comparison; unlike
    # lower() it also folds forms such as "ß" -> "ss" and the "ﬁ" ligature
    # -> "fi", so those still match their plain spellings.
    search_term = keyword.casefold()

    # The context manager closes the document on every exit path, including
    # when page loading or text extraction raises part-way through the scan.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            if search_term in page.get_text().casefold():
                return page_num

    return None
24
+
25
def save_page_as_png(pdf_path: Path, page_index: int, output_path: Path, dpi: int) -> None:
    """Render a single PDF page and save it as a PNG image.

    Args:
        pdf_path: Path of the source PDF file.
        page_index: Zero-based index of the page to render.
        output_path: Destination path of the image file.
        dpi: Target resolution of the rendered image; must be positive.

    Raises:
        ValueError: If *dpi* is zero or negative.
    """
    # Validate before opening anything: a non-positive DPI would yield a zero
    # or negative zoom factor, which cannot describe a renderable image.
    if dpi <= 0:
        raise ValueError(f"dpi must be a positive integer, got {dpi}")

    # PyMuPDF renders at 72 DPI by default, so the scale factor needed to
    # reach the requested resolution is target DPI / 72.
    zoom = dpi / 72

    # The context manager closes the document even if rendering or saving
    # fails, so the file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        page = doc.load_page(page_index)
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        pix.save(str(output_path))
40
+
41
# NOTE: Typer renders this function's docstring and the `help=` strings as the
# CLI help output, so they are runtime text and are kept verbatim. The English
# description lives in these comments instead.
#
# Purpose: find the first page of `pdf_path` whose text contains `keyword`
# and export that page as an image.
#
# Parameters:
#   pdf_path - existing, readable PDF file (validated by Typer).
#   keyword  - text to search for.
#   output   - destination image path; defaults to
#              "<pdf stem>_page_<1-based page number>.png" in the current
#              working directory.
#   dpi      - output resolution; `min=1` makes the CLI reject zero and
#              negative values before any work is done.
#
# Exit status: 0 on success; 1 when the keyword is not found or when the
# search or the conversion fails, so shell scripts can detect every failure.
@app.command()
def scout(
    pdf_path: Path = typer.Argument(..., help="输入的 PDF 文件路径", exists=True, file_okay=True, dir_okay=False, readable=True),
    keyword: str = typer.Argument(..., help="要搜索的关键词"),
    output: Optional[Path] = typer.Option(None, "--output", "-o", help="输出图片路径。如果不指定,则使用默认命名规则"),
    dpi: int = typer.Option(300, "--dpi", "-d", min=1, help="输出图片的 DPI 分辨率"),
):
    """
    执行侦察任务:搜索关键词并导出页面为 PNG。
    """
    # 1. Search. Failures here (e.g. a file that cannot be parsed as a PDF)
    #    are reported the same way as conversion failures instead of
    #    surfacing as a raw traceback.
    typer.echo(f"🔍 正在 '{pdf_path.name}' 中寻找关键词: '{keyword}'...")
    try:
        page_idx = find_first_page_with_keyword(pdf_path, keyword)
    except Exception as e:
        typer.secho(f"💥 搜索过程中出错: {e}", fg="red")
        raise typer.Exit(code=1) from e

    if page_idx is None:
        typer.secho(f"❌ 未在文档中找到关键词: '{keyword}'", fg="red")
        # Non-zero status: nothing was exported, so callers must be able to
        # tell this apart from a successful run.
        raise typer.Exit(code=1)

    # 2. Decide the output file name. Page numbers shown to the user are
    #    1-based, while page_idx is 0-based.
    if output is None:
        output = Path.cwd() / f"{pdf_path.stem}_page_{page_idx + 1}.png"

    # 3. Render and save. Only the call that can fail sits inside the try.
    try:
        save_page_as_png(pdf_path, page_idx, output, dpi)
    except Exception as e:
        typer.secho(f"💥 转换过程中出错: {e}", fg="red")
        raise typer.Exit(code=1) from e

    typer.secho(f"✅ 成功!页面 {page_idx + 1} 已保存至: {output}", fg="green")
72
+
73
# Run the Typer application when this module is executed directly as a script.
if __name__ == "__main__":
    app()