pysfi 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pysfi-0.1.12.dist-info → pysfi-0.1.14.dist-info}/METADATA +1 -1
- pysfi-0.1.14.dist-info/RECORD +68 -0
- {pysfi-0.1.12.dist-info → pysfi-0.1.14.dist-info}/entry_points.txt +3 -0
- sfi/__init__.py +19 -2
- sfi/alarmclock/__init__.py +3 -0
- sfi/alarmclock/alarmclock.py +23 -40
- sfi/bumpversion/__init__.py +3 -1
- sfi/bumpversion/bumpversion.py +64 -15
- sfi/cleanbuild/__init__.py +3 -0
- sfi/cleanbuild/cleanbuild.py +5 -1
- sfi/cli.py +25 -4
- sfi/condasetup/__init__.py +1 -0
- sfi/condasetup/condasetup.py +91 -76
- sfi/docdiff/__init__.py +1 -0
- sfi/docdiff/docdiff.py +3 -2
- sfi/docscan/__init__.py +1 -1
- sfi/docscan/docscan.py +78 -23
- sfi/docscan/docscan_gui.py +152 -48
- sfi/filedate/filedate.py +12 -5
- sfi/img2pdf/img2pdf.py +453 -0
- sfi/llmclient/llmclient.py +31 -8
- sfi/llmquantize/llmquantize.py +76 -37
- sfi/llmserver/__init__.py +1 -0
- sfi/llmserver/llmserver.py +63 -13
- sfi/makepython/makepython.py +1145 -201
- sfi/pdfsplit/pdfsplit.py +45 -12
- sfi/pyarchive/__init__.py +1 -0
- sfi/pyarchive/pyarchive.py +908 -278
- sfi/pyembedinstall/pyembedinstall.py +88 -89
- sfi/pylibpack/pylibpack.py +561 -463
- sfi/pyloadergen/pyloadergen.py +372 -218
- sfi/pypack/pypack.py +510 -959
- sfi/pyprojectparse/pyprojectparse.py +337 -40
- sfi/pysourcepack/__init__.py +1 -0
- sfi/pysourcepack/pysourcepack.py +210 -131
- sfi/quizbase/quizbase_gui.py +2 -2
- sfi/taskkill/taskkill.py +168 -59
- sfi/which/which.py +11 -3
- pysfi-0.1.12.dist-info/RECORD +0 -62
- sfi/workflowengine/workflowengine.py +0 -444
- {pysfi-0.1.12.dist-info → pysfi-0.1.14.dist-info}/WHEEL +0 -0
- /sfi/{workflowengine → img2pdf}/__init__.py +0 -0
sfi/condasetup/condasetup.py
CHANGED
|
@@ -3,92 +3,106 @@ from __future__ import annotations
|
|
|
3
3
|
import argparse
|
|
4
4
|
import logging
|
|
5
5
|
import os
|
|
6
|
+
import subprocess
|
|
6
7
|
from pathlib import Path
|
|
8
|
+
from typing import Final
|
|
7
9
|
|
|
8
10
|
logging.basicConfig(level=logging.INFO, format="%(message)s")
|
|
9
11
|
logger = logging.getLogger(__name__)
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
_CONDA_MIRROR_URLS: dict[str, frozenset[str]] = {
|
|
13
|
-
"tsinghua": frozenset(
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
"https://mirrors.aliyun.com/anaconda/pkgs/pro/",
|
|
61
|
-
"https://mirrors.aliyun.com/anaconda/pkgs/dev/",
|
|
62
|
-
"https://mirrors.aliyun.com/anaconda/cloud/conda-forge/",
|
|
63
|
-
"https://mirrors.aliyun.com/anaconda/cloud/bioconda/",
|
|
64
|
-
"https://mirrors.aliyun.com/anaconda/cloud/menpo/",
|
|
65
|
-
"https://mirrors.aliyun.com/anaconda/cloud/pytorch/",
|
|
66
|
-
]
|
|
67
|
-
),
|
|
12
|
+
|
|
13
|
+
# Conda mirror URLs
|
|
14
|
+
_CONDA_MIRROR_URLS: Final[dict[str, frozenset[str]]] = {
|
|
15
|
+
"tsinghua": frozenset([
|
|
16
|
+
"https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/",
|
|
17
|
+
"https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/",
|
|
18
|
+
"https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/r/",
|
|
19
|
+
"https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/msys2/",
|
|
20
|
+
"https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/pro/",
|
|
21
|
+
"https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/",
|
|
22
|
+
"https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/bioconda/",
|
|
23
|
+
"https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/menpo/",
|
|
24
|
+
"https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch/",
|
|
25
|
+
]),
|
|
26
|
+
"ustc": frozenset([
|
|
27
|
+
"https://mirrors.ustc.edu.cn/anaconda/pkgs/main/",
|
|
28
|
+
"https://mirrors.ustc.edu.cn/anaconda/pkgs/free/",
|
|
29
|
+
"https://mirrors.ustc.edu.cn/anaconda/pkgs/r/",
|
|
30
|
+
"https://mirrors.ustc.edu.cn/anaconda/pkgs/msys2/",
|
|
31
|
+
"https://mirrors.ustc.edu.cn/anaconda/pkgs/pro/",
|
|
32
|
+
"https://mirrors.ustc.edu.cn/anaconda/pkgs/dev/",
|
|
33
|
+
"https://mirrors.ustc.edu.cn/anaconda/cloud/conda-forge/",
|
|
34
|
+
"https://mirrors.ustc.edu.cn/anaconda/cloud/bioconda/",
|
|
35
|
+
"https://mirrors.ustc.edu.cn/anaconda/cloud/menpo/",
|
|
36
|
+
"https://mirrors.ustc.edu.cn/anaconda/cloud/pytorch/",
|
|
37
|
+
]),
|
|
38
|
+
"bsfu": frozenset([
|
|
39
|
+
"https://mirrors.bsfu.edu.cn/anaconda/pkgs/main/",
|
|
40
|
+
"https://mirrors.bsfu.edu.cn/anaconda/pkgs/free/",
|
|
41
|
+
"https://mirrors.bsfu.edu.cn/anaconda/pkgs/r/",
|
|
42
|
+
"https://mirrors.bsfu.edu.cn/anaconda/pkgs/msys2/",
|
|
43
|
+
"https://mirrors.bsfu.edu.cn/anaconda/pkgs/pro/",
|
|
44
|
+
"https://mirrors.bsfu.edu.cn/anaconda/pkgs/dev/",
|
|
45
|
+
"https://mirrors.bsfu.edu.cn/anaconda/cloud/conda-forge/",
|
|
46
|
+
"https://mirrors.bsfu.edu.cn/anaconda/cloud/bioconda/",
|
|
47
|
+
"https://mirrors.bsfu.edu.cn/anaconda/cloud/menpo/",
|
|
48
|
+
"https://mirrors.bsfu.edu.cn/anaconda/cloud/pytorch/",
|
|
49
|
+
]),
|
|
50
|
+
"aliyun": frozenset([
|
|
51
|
+
"https://mirrors.aliyun.com/anaconda/pkgs/main/",
|
|
52
|
+
"https://mirrors.aliyun.com/anaconda/pkgs/free/",
|
|
53
|
+
"https://mirrors.aliyun.com/anaconda/pkgs/r/",
|
|
54
|
+
"https://mirrors.aliyun.com/anaconda/pkgs/msys2/",
|
|
55
|
+
"https://mirrors.aliyun.com/anaconda/pkgs/pro/",
|
|
56
|
+
"https://mirrors.aliyun.com/anaconda/pkgs/dev/",
|
|
57
|
+
"https://mirrors.aliyun.com/anaconda/cloud/conda-forge/",
|
|
58
|
+
"https://mirrors.aliyun.com/anaconda/cloud/bioconda/",
|
|
59
|
+
"https://mirrors.aliyun.com/anaconda/cloud/menpo/",
|
|
60
|
+
"https://mirrors.aliyun.com/anaconda/cloud/pytorch/",
|
|
61
|
+
]),
|
|
68
62
|
}
|
|
69
63
|
|
|
70
64
|
|
|
71
65
|
def set_conda_mirror(mirror: str = "tsinghua") -> None:
|
|
72
|
-
"""Set the Conda mirror for the given channel.
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
else:
|
|
79
|
-
logger.debug("No existing .condarc file found")
|
|
80
|
-
|
|
81
|
-
mirror_urls = _CONDA_MIRROR_URLS[mirror]
|
|
82
|
-
for url in mirror_urls:
|
|
83
|
-
logger.debug(f"Adding mirror: {url}")
|
|
84
|
-
os.system(f"conda config --add channels {url}")
|
|
85
|
-
os.system("conda config --set show_channel_urls yes")
|
|
86
|
-
logger.info("Conda mirror set successfully")
|
|
87
|
-
else:
|
|
66
|
+
"""Set the Conda mirror for the given channel.
|
|
67
|
+
|
|
68
|
+
Args:
|
|
69
|
+
mirror: Mirror name (tsinghua, ustc, bsfu, or aliyun)
|
|
70
|
+
"""
|
|
71
|
+
if mirror not in _CONDA_MIRROR_URLS:
|
|
88
72
|
logger.error(f"Invalid mirror: {mirror}")
|
|
73
|
+
return
|
|
74
|
+
|
|
75
|
+
old_config = Path.home() / ".condarc"
|
|
76
|
+
if old_config.exists():
|
|
77
|
+
logger.info("Found existing .condarc file, backing it up")
|
|
78
|
+
os.rename(old_config, Path.home() / ".condarc.bak")
|
|
79
|
+
else:
|
|
80
|
+
logger.debug("No existing .condarc file found")
|
|
81
|
+
|
|
82
|
+
mirror_urls = _CONDA_MIRROR_URLS[mirror]
|
|
83
|
+
for url in mirror_urls:
|
|
84
|
+
logger.debug(f"Adding mirror: {url}")
|
|
85
|
+
try:
|
|
86
|
+
subprocess.run(["conda", "config", "--add", "channels", url], check=True)
|
|
87
|
+
except subprocess.CalledProcessError as e:
|
|
88
|
+
logger.error(f"Failed to add mirror {url}: {e}")
|
|
89
|
+
return
|
|
90
|
+
|
|
91
|
+
try:
|
|
92
|
+
subprocess.run(
|
|
93
|
+
["conda", "config", "--set", "show_channel_urls", "yes"], check=True
|
|
94
|
+
)
|
|
95
|
+
logger.info("Conda mirror set successfully")
|
|
96
|
+
except subprocess.CalledProcessError as e:
|
|
97
|
+
logger.error(f"Failed to set show_channel_urls: {e}")
|
|
98
|
+
|
|
89
99
|
|
|
100
|
+
def parse_args() -> argparse.Namespace:
|
|
101
|
+
"""Parse command line arguments.
|
|
90
102
|
|
|
91
|
-
|
|
103
|
+
Returns:
|
|
104
|
+
Parsed arguments
|
|
105
|
+
"""
|
|
92
106
|
parser = argparse.ArgumentParser(description="Setup Conda environment for SFI")
|
|
93
107
|
parser.add_argument(
|
|
94
108
|
"mirror",
|
|
@@ -103,7 +117,8 @@ def parse_args():
|
|
|
103
117
|
return parser.parse_args()
|
|
104
118
|
|
|
105
119
|
|
|
106
|
-
def main():
|
|
120
|
+
def main() -> None:
|
|
121
|
+
"""Main entry point for condasetup CLI."""
|
|
107
122
|
args = parse_args()
|
|
108
123
|
|
|
109
124
|
if args.debug:
|
sfi/docdiff/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
sfi/docdiff/docdiff.py
CHANGED
|
@@ -10,9 +10,10 @@ import time
|
|
|
10
10
|
from dataclasses import dataclass
|
|
11
11
|
from functools import cached_property
|
|
12
12
|
from pathlib import Path
|
|
13
|
-
from typing import Any
|
|
13
|
+
from typing import Any, Final
|
|
14
14
|
|
|
15
|
-
|
|
15
|
+
# Configuration file path
|
|
16
|
+
CONFIG_FILE: Final[Path] = Path.home() / ".pysfi" / "docdiff.json"
|
|
16
17
|
|
|
17
18
|
logging.basicConfig(level=logging.INFO, format="%(message)s")
|
|
18
19
|
logger = logging.getLogger(__name__)
|
sfi/docscan/__init__.py
CHANGED
sfi/docscan/docscan.py
CHANGED
|
@@ -91,7 +91,9 @@ def t(key: str, **kwargs) -> str:
|
|
|
91
91
|
Returns:
|
|
92
92
|
Translated text
|
|
93
93
|
"""
|
|
94
|
-
text =
|
|
94
|
+
text = (
|
|
95
|
+
ZH_TRANSLATIONS.get(key, key) if USE_CHINESE else EN_TRANSLATIONS.get(key, key)
|
|
96
|
+
)
|
|
95
97
|
|
|
96
98
|
# Format with kwargs if provided
|
|
97
99
|
if kwargs:
|
|
@@ -123,7 +125,9 @@ class Rule:
|
|
|
123
125
|
# Use re.ASCII for faster matching when possible
|
|
124
126
|
self.compiled_pattern = re.compile(self.pattern, flags | re.ASCII)
|
|
125
127
|
except re.error as e:
|
|
126
|
-
logger.warning(
|
|
128
|
+
logger.warning(
|
|
129
|
+
t("invalid_regex_pattern", pattern=self.pattern, error=e)
|
|
130
|
+
)
|
|
127
131
|
self.compiled_pattern = None
|
|
128
132
|
else:
|
|
129
133
|
self.compiled_pattern = None
|
|
@@ -274,13 +278,18 @@ class DocumentScanner:
|
|
|
274
278
|
"use_pdf_ocr": self.use_pdf_ocr,
|
|
275
279
|
"use_process_pool": self.use_process_pool,
|
|
276
280
|
},
|
|
277
|
-
"rules": [
|
|
281
|
+
"rules": [
|
|
282
|
+
{"name": r.name, "pattern": r.pattern, "is_regex": r.is_regex}
|
|
283
|
+
for r in self.rules
|
|
284
|
+
],
|
|
278
285
|
"matches": [],
|
|
279
286
|
}
|
|
280
287
|
|
|
281
288
|
# Scan files in parallel
|
|
282
289
|
processed = 0
|
|
283
|
-
executor_class =
|
|
290
|
+
executor_class = (
|
|
291
|
+
ProcessPoolExecutor if self.use_process_pool else ThreadPoolExecutor
|
|
292
|
+
)
|
|
284
293
|
executor = executor_class(max_workers=threads)
|
|
285
294
|
self._executor = executor # Keep reference for forced shutdown
|
|
286
295
|
|
|
@@ -350,10 +359,17 @@ class DocumentScanner:
|
|
|
350
359
|
break
|
|
351
360
|
|
|
352
361
|
try:
|
|
353
|
-
file_result = future.result(
|
|
362
|
+
file_result = future.result(
|
|
363
|
+
timeout=1.0
|
|
364
|
+
) # Short timeout to allow quick stop
|
|
354
365
|
if file_result and file_result["matches"]:
|
|
355
366
|
results["matches"].append(file_result)
|
|
356
|
-
logger.info(
|
|
367
|
+
logger.info(
|
|
368
|
+
t(
|
|
369
|
+
"found_matches_in_file",
|
|
370
|
+
file_name=Path(file_result.get("file_path", "")).name,
|
|
371
|
+
)
|
|
372
|
+
)
|
|
357
373
|
except TimeoutError:
|
|
358
374
|
logger.warning(t("task_timeout_scan_may_be_stopping"))
|
|
359
375
|
if self.stopped:
|
|
@@ -366,7 +382,9 @@ class DocumentScanner:
|
|
|
366
382
|
|
|
367
383
|
# Report progress
|
|
368
384
|
if show_progress and processed % 10 == 0:
|
|
369
|
-
logger.info(
|
|
385
|
+
logger.info(
|
|
386
|
+
t("progress_report", processed=processed, total=len(files))
|
|
387
|
+
)
|
|
370
388
|
|
|
371
389
|
# Call progress callback if set
|
|
372
390
|
if self._progress_callback:
|
|
@@ -391,7 +409,9 @@ class DocumentScanner:
|
|
|
391
409
|
if self.stopped:
|
|
392
410
|
logger.info(t("scan_stopped_processed_files", processed=processed))
|
|
393
411
|
else:
|
|
394
|
-
logger.info(
|
|
412
|
+
logger.info(
|
|
413
|
+
t("scan_complete_found_matches", matches_count=len(results["matches"]))
|
|
414
|
+
)
|
|
395
415
|
|
|
396
416
|
return results
|
|
397
417
|
|
|
@@ -493,7 +513,9 @@ class DocumentScanner:
|
|
|
493
513
|
return {}
|
|
494
514
|
|
|
495
515
|
except Exception as e:
|
|
496
|
-
logger.warning(
|
|
516
|
+
logger.warning(
|
|
517
|
+
t("could_not_extract_text_from_file", file_path=file_path, error=e)
|
|
518
|
+
)
|
|
497
519
|
return {}
|
|
498
520
|
|
|
499
521
|
processing_time = time.perf_counter() - file_start_time
|
|
@@ -549,14 +571,18 @@ class DocumentScanner:
|
|
|
549
571
|
try:
|
|
550
572
|
return self._extract_pdf_fitz(file_path)
|
|
551
573
|
except Exception as e:
|
|
552
|
-
logger.warning(
|
|
574
|
+
logger.warning(
|
|
575
|
+
t("pymupdf_failed_for_file", file_name=file_path.name, error=e)
|
|
576
|
+
)
|
|
553
577
|
|
|
554
578
|
# Fallback to pypdf
|
|
555
579
|
if pypdf is not None:
|
|
556
580
|
try:
|
|
557
581
|
return self._extract_pdf_pypdf(file_path)
|
|
558
582
|
except Exception as e:
|
|
559
|
-
logger.error(
|
|
583
|
+
logger.error(
|
|
584
|
+
t("pypdf_also_failed_for_file", file_name=file_path.name, error=e)
|
|
585
|
+
)
|
|
560
586
|
return "", {}
|
|
561
587
|
|
|
562
588
|
logger.warning(t("no_pdf_library_installed"))
|
|
@@ -632,7 +658,9 @@ class DocumentScanner:
|
|
|
632
658
|
except Exception as e:
|
|
633
659
|
if doc:
|
|
634
660
|
doc.close()
|
|
635
|
-
logger.warning(
|
|
661
|
+
logger.warning(
|
|
662
|
+
t("pymupdf_error_trying_fallback", file_path=file_path, error=e)
|
|
663
|
+
)
|
|
636
664
|
# Re-raise to trigger fallback to pypdf
|
|
637
665
|
raise
|
|
638
666
|
|
|
@@ -764,8 +792,12 @@ class DocumentScanner:
|
|
|
764
792
|
text_parts.append(text)
|
|
765
793
|
|
|
766
794
|
metadata = {
|
|
767
|
-
"title": book.get_metadata("DC", "title")[0][0]
|
|
768
|
-
|
|
795
|
+
"title": book.get_metadata("DC", "title")[0][0]
|
|
796
|
+
if book.get_metadata("DC", "title")
|
|
797
|
+
else "", # pyright: ignore[reportAttributeAccessIssue]
|
|
798
|
+
"author": book.get_metadata("DC", "creator")[0][0]
|
|
799
|
+
if book.get_metadata("DC", "creator")
|
|
800
|
+
else "", # pyright: ignore[reportAttributeAccessIssue]
|
|
769
801
|
"format": "EPUB",
|
|
770
802
|
}
|
|
771
803
|
|
|
@@ -810,7 +842,9 @@ class DocumentScanner:
|
|
|
810
842
|
root = tree.getroot()
|
|
811
843
|
|
|
812
844
|
# Extract all text content
|
|
813
|
-
text_parts = [
|
|
845
|
+
text_parts = [
|
|
846
|
+
elem.text for elem in root.iter() if elem.text and elem.text.strip()
|
|
847
|
+
]
|
|
814
848
|
text = "\n".join(text_parts)
|
|
815
849
|
|
|
816
850
|
metadata = {
|
|
@@ -954,7 +988,9 @@ class DocumentScanner:
|
|
|
954
988
|
wb.close()
|
|
955
989
|
return "", {}
|
|
956
990
|
|
|
957
|
-
row_text = " | ".join(
|
|
991
|
+
row_text = " | ".join(
|
|
992
|
+
str(cell) if cell is not None else "" for cell in row
|
|
993
|
+
)
|
|
958
994
|
if row_text.strip():
|
|
959
995
|
text_parts.append(row_text)
|
|
960
996
|
|
|
@@ -1017,7 +1053,9 @@ class DocumentScanner:
|
|
|
1017
1053
|
|
|
1018
1054
|
return text, metadata
|
|
1019
1055
|
except Exception as e:
|
|
1020
|
-
logger.warning(
|
|
1056
|
+
logger.warning(
|
|
1057
|
+
t("could_not_perform_ocr_on_file", file_path=file_path, error=e)
|
|
1058
|
+
)
|
|
1021
1059
|
return "", {}
|
|
1022
1060
|
|
|
1023
1061
|
def _extract_text(self, file_path: Path) -> tuple[str, dict[str, Any]]:
|
|
@@ -1047,8 +1085,12 @@ def main():
|
|
|
1047
1085
|
USE_CHINESE = temp_args.lang == "zh"
|
|
1048
1086
|
|
|
1049
1087
|
parser = argparse.ArgumentParser(description=t("document_scanner_description"))
|
|
1050
|
-
parser.add_argument(
|
|
1051
|
-
|
|
1088
|
+
parser.add_argument(
|
|
1089
|
+
"input", type=str, nargs="?", default=str(cwd), help=t("input_directory_help")
|
|
1090
|
+
)
|
|
1091
|
+
parser.add_argument(
|
|
1092
|
+
"-r", "--rules", type=str, default="rules.json", help=t("rules_file_help")
|
|
1093
|
+
)
|
|
1052
1094
|
parser.add_argument("--recursive", action="store_true", help=t("recursive_help"))
|
|
1053
1095
|
parser.add_argument(
|
|
1054
1096
|
"-f",
|
|
@@ -1056,7 +1098,9 @@ def main():
|
|
|
1056
1098
|
help=t("file_types_help"),
|
|
1057
1099
|
default="pdf,docx,xlsx,pptx,txt,odt,rtf,epub,csv,xml,html,md,jpg,jpeg,png,gif,bmp,tiff",
|
|
1058
1100
|
)
|
|
1059
|
-
parser.add_argument(
|
|
1101
|
+
parser.add_argument(
|
|
1102
|
+
"--use-pdf-ocr", help=t("use_pdf_ocr_help"), action="store_true"
|
|
1103
|
+
)
|
|
1060
1104
|
parser.add_argument(
|
|
1061
1105
|
"--use-process-pool",
|
|
1062
1106
|
help=t("use_process_pool_help"),
|
|
@@ -1074,7 +1118,9 @@ def main():
|
|
|
1074
1118
|
parser.add_argument("-v", "--verbose", help=t("verbose_help"), action="store_true")
|
|
1075
1119
|
|
|
1076
1120
|
# 添加语言参数
|
|
1077
|
-
parser.add_argument(
|
|
1121
|
+
parser.add_argument(
|
|
1122
|
+
"--lang", help=t("language_help"), choices=["en", "zh"], default="zh"
|
|
1123
|
+
)
|
|
1078
1124
|
|
|
1079
1125
|
args = parser.parse_args()
|
|
1080
1126
|
|
|
@@ -1129,11 +1175,20 @@ def main():
|
|
|
1129
1175
|
file_types = [ft.strip() for ft in args.file_types.split(",")]
|
|
1130
1176
|
|
|
1131
1177
|
# Create scanner and run scan
|
|
1132
|
-
scanner = DocumentScanner(
|
|
1178
|
+
scanner = DocumentScanner(
|
|
1179
|
+
input_dir,
|
|
1180
|
+
rules,
|
|
1181
|
+
file_types,
|
|
1182
|
+
args.use_pdf_ocr,
|
|
1183
|
+
args.use_process_pool,
|
|
1184
|
+
args.batch_size,
|
|
1185
|
+
)
|
|
1133
1186
|
results = scanner.scan(threads=args.threads, show_progress=args.progress)
|
|
1134
1187
|
|
|
1135
1188
|
# Save results to JSON file in input directory
|
|
1136
|
-
output_file =
|
|
1189
|
+
output_file = (
|
|
1190
|
+
input_dir / f"scan_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
|
|
1191
|
+
)
|
|
1137
1192
|
with open(output_file, "w", encoding="utf-8") as f:
|
|
1138
1193
|
json.dump(results, f, indent=2, ensure_ascii=False)
|
|
1139
1194
|
|