pysfi 0.1.7__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pysfi-0.1.7.dist-info → pysfi-0.1.11.dist-info}/METADATA +11 -9
- pysfi-0.1.11.dist-info/RECORD +60 -0
- pysfi-0.1.11.dist-info/entry_points.txt +28 -0
- sfi/__init__.py +1 -1
- sfi/alarmclock/alarmclock.py +40 -40
- sfi/bumpversion/__init__.py +1 -1
- sfi/cleanbuild/cleanbuild.py +155 -0
- sfi/condasetup/condasetup.py +116 -0
- sfi/docscan/__init__.py +1 -1
- sfi/docscan/docscan.py +407 -103
- sfi/docscan/docscan_gui.py +1282 -596
- sfi/docscan/lang/eng.py +152 -0
- sfi/docscan/lang/zhcn.py +170 -0
- sfi/filedate/filedate.py +185 -112
- sfi/gittool/__init__.py +2 -0
- sfi/gittool/gittool.py +401 -0
- sfi/llmclient/llmclient.py +592 -0
- sfi/llmquantize/llmquantize.py +480 -0
- sfi/llmserver/llmserver.py +335 -0
- sfi/makepython/makepython.py +31 -30
- sfi/pdfsplit/pdfsplit.py +173 -173
- sfi/pyarchive/pyarchive.py +418 -0
- sfi/pyembedinstall/pyembedinstall.py +629 -0
- sfi/pylibpack/__init__.py +0 -0
- sfi/pylibpack/pylibpack.py +1457 -0
- sfi/pylibpack/rules/numpy.json +22 -0
- sfi/pylibpack/rules/pymupdf.json +10 -0
- sfi/pylibpack/rules/pyqt5.json +19 -0
- sfi/pylibpack/rules/pyside2.json +23 -0
- sfi/pylibpack/rules/scipy.json +23 -0
- sfi/pylibpack/rules/shiboken2.json +24 -0
- sfi/pyloadergen/pyloadergen.py +512 -227
- sfi/pypack/__init__.py +0 -0
- sfi/pypack/pypack.py +1142 -0
- sfi/pyprojectparse/__init__.py +0 -0
- sfi/pyprojectparse/pyprojectparse.py +500 -0
- sfi/pysourcepack/pysourcepack.py +308 -0
- sfi/quizbase/__init__.py +0 -0
- sfi/quizbase/quizbase.py +828 -0
- sfi/quizbase/quizbase_gui.py +987 -0
- sfi/regexvalidate/__init__.py +0 -0
- sfi/regexvalidate/regex_help.html +284 -0
- sfi/regexvalidate/regexvalidate.py +468 -0
- sfi/taskkill/taskkill.py +0 -2
- sfi/workflowengine/__init__.py +0 -0
- sfi/workflowengine/workflowengine.py +444 -0
- pysfi-0.1.7.dist-info/RECORD +0 -31
- pysfi-0.1.7.dist-info/entry_points.txt +0 -15
- sfi/embedinstall/embedinstall.py +0 -418
- sfi/projectparse/projectparse.py +0 -152
- sfi/pypacker/fspacker.py +0 -91
- {pysfi-0.1.7.dist-info → pysfi-0.1.11.dist-info}/WHEEL +0 -0
- /sfi/{embedinstall → docscan/lang}/__init__.py +0 -0
- /sfi/{projectparse → llmquantize}/__init__.py +0 -0
- /sfi/{pypacker → pyembedinstall}/__init__.py +0 -0
sfi/docscan/docscan.py CHANGED
@@ -3,11 +3,13 @@
 from __future__ import annotations
 
 import argparse
+import contextlib
 import csv
 import html
 import json
 import logging
 import re
+import sys
 import threading
 import time
 import xml.etree.ElementTree as ET
@@ -62,6 +64,42 @@ try:
 except ImportError:
     pypdf = None
 
+# Language support imports
+try:
+    from sfi.docscan.lang.eng import ENGLISH_DEFAULTS as EN_TRANSLATIONS
+    from sfi.docscan.lang.zhcn import TRANSLATIONS as ZH_TRANSLATIONS
+except ImportError:
+    try:
+        from lang.eng import ENGLISH_DEFAULTS as EN_TRANSLATIONS
+        from lang.zhcn import TRANSLATIONS as ZH_TRANSLATIONS
+    except ImportError:
+        # Fallback translations if import fails
+        ZH_TRANSLATIONS = {}
+        EN_TRANSLATIONS = {}
+
+# Global language setting
+USE_CHINESE = True  # Default to Chinese
+
+
+def t(key: str, **kwargs) -> str:
+    """Get translated text for the given key.
+
+    Args:
+        key: Translation key
+        **kwargs: Arguments for string formatting
+
+    Returns:
+        Translated text
+    """
+    text = ZH_TRANSLATIONS.get(key, key) if USE_CHINESE else EN_TRANSLATIONS.get(key, key)
+
+    # Format with kwargs if provided
+    if kwargs:
+        with contextlib.suppress(KeyError, ValueError):
+            text = text.format(**kwargs)
+    return text
+
+
 logging.basicConfig(level=logging.INFO, format="%(message)s")
 logger = logging.getLogger(__name__)
 cwd = Path.cwd()
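The `t()` helper added above is the backbone of the 0.1.11 localization work: every hard-coded log string in this file becomes a key lookup against a per-language dictionary. A minimal standalone sketch of the same pattern follows; the dictionary entries here are hypothetical, while the real tables live in `sfi/docscan/lang/eng.py` and `sfi/docscan/lang/zhcn.py`:

```python
import contextlib

# Hypothetical entries; the shipped tables are much larger.
EN_TRANSLATIONS = {"found_files_to_scan": "Found {count} files to scan"}
ZH_TRANSLATIONS = {"found_files_to_scan": "找到 {count} 个待扫描文件"}
USE_CHINESE = True


def t(key: str, **kwargs) -> str:
    """Look up key in the active table, falling back to the key itself."""
    text = ZH_TRANSLATIONS.get(key, key) if USE_CHINESE else EN_TRANSLATIONS.get(key, key)
    if kwargs:
        # Swallow bad format templates so a missing placeholder never crashes logging.
        with contextlib.suppress(KeyError, ValueError):
            text = text.format(**kwargs)
    return text


print(t("found_files_to_scan", count=42))  # -> 找到 42 个待扫描文件
print(t("unknown_key"))                    # -> unknown_key (graceful fallback)
```

Falling back to the key itself is what keeps the scanner usable even when the `lang` package fails to import and both tables are empty.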
@@ -85,7 +123,7 @@ class Rule:
                 # Use re.ASCII for faster matching when possible
                 self.compiled_pattern = re.compile(self.pattern, flags | re.ASCII)
             except re.error as e:
-                logger.warning(
+                logger.warning(t("invalid_regex_pattern", pattern=self.pattern, error=e))
                 self.compiled_pattern = None
         else:
             self.compiled_pattern = None
@@ -173,6 +211,7 @@ class DocumentScanner:
         self.paused_event.set()  # Initially not paused
         self.stopped = False
         self._progress_callback = None
+        self._executor = None  # Keep reference to executor for forced shutdown
 
     def set_progress_callback(self, callback: Callable[[int, int], None]) -> None:
         """Set callback function for progress updates.
@@ -186,19 +225,18 @@ class DocumentScanner:
         """Pause the scanning process."""
         self.paused = True
         self.paused_event.clear()
-        logger.info("Scan paused")
 
     def resume(self) -> None:
         """Resume the scanning process."""
         self.paused = False
         self.paused_event.set()
-        logger.info("
+        logger.info(t("scan_resumed"))
 
     def stop(self) -> None:
         """Stop the scanning process."""
         self.stopped = True
         self.paused_event.set()  # Ensure thread can exit
-        logger.info("
+        logger.info(t("scan_stopped"))
 
     def is_paused(self) -> bool:
         """Check if the scanner is paused."""
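The pause/resume/stop methods above pair a plain `paused` flag with a `threading.Event` that is set while running and cleared while paused, so waiting threads wake promptly on `resume()` or `stop()`. A stripped-down sketch of that protocol, using a hypothetical worker rather than the package's actual API:

```python
import threading


class Worker:
    def __init__(self) -> None:
        self.paused = False
        self.stopped = False
        self.paused_event = threading.Event()
        self.paused_event.set()  # set means "not paused"

    def pause(self) -> None:
        self.paused = True
        self.paused_event.clear()

    def resume(self) -> None:
        self.paused = False
        self.paused_event.set()

    def stop(self) -> None:
        self.stopped = True
        self.paused_event.set()  # wake any waiter so it can observe stopped

    def run(self, items: list) -> list:
        done = []
        for item in items:
            # Wait in short slices so stop() is honored even while paused.
            while self.paused:
                self.paused_event.wait(0.1)
                if self.stopped:
                    return done
            if self.stopped:
                return done
            done.append(item)  # stand-in for real per-item work
        return done
```

The same wait-in-0.1s-slices idiom recurs throughout the extractor methods later in this diff.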
@@ -222,9 +260,9 @@ class DocumentScanner:
         self.paused = False
         self.paused_event.set()
 
-        logger.info(
+        logger.info(t("scanning_directory", directory=str(self.input_dir)))
         files = self._collect_files()
-        logger.info(
+        logger.info(t("found_files_to_scan", count=len(files)))
 
         results = {
             "scan_info": {
@@ -243,53 +281,117 @@
         # Scan files in parallel
         processed = 0
         executor_class = ProcessPoolExecutor if self.use_process_pool else ThreadPoolExecutor
-
-
+        executor = executor_class(max_workers=threads)
+        self._executor = executor  # Keep reference for forced shutdown
 
-
+        try:
+            # Submit futures one by one to respect pause state
+            submitted_futures = []
+            was_paused = False  # Track previous pause state
+
+            for file in files:
+                # Check if stopped before submitting more tasks
+                if self.stopped:
+                    logger.info(t("scan_stopped_before_submitting_tasks"))
+                    break
+
+                # Wait if paused before submitting new tasks
+                while self.paused:
+                    # Log when entering paused state
+                    if not was_paused:
+                        logger.info(t("scan_paused"))
+                        was_paused = True
+
+                    self.paused_event.wait(0.1)
+                    if self.stopped:
+                        logger.info(t("scan_stopped_while_paused"))
+                        break
+
+                # Log when exiting paused state
+                if was_paused and not self.paused:
+                    logger.info(t("scan_resumed"))
+                    was_paused = False
+
+                if self.stopped:
+                    break
+
+                future = executor.submit(self._scan_file_with_pause_check, file)
+                submitted_futures.append(future)
+
+            # Process completed futures
+            for future in as_completed(submitted_futures):
                 # Check if stopped before processing this future
                 if self.stopped:
-                    logger.info("
+                    logger.info(t("scan_stopped_by_user_canceling_tasks"))
                     # Cancel all remaining futures
-                    for f in
+                    for f in submitted_futures:
                         if not f.done():
                             f.cancel()
                     break
 
-                # Wait if paused
+                # Wait if paused before processing result
                 while self.paused:
-
+                    # Log when entering paused state
+                    if not was_paused:
+                        logger.info(t("scan_paused"))
+                        was_paused = True
+
+                    self.paused_event.wait(0.1)
                     if self.stopped:
-                        logger.info("
+                        logger.info(t("scan_stopped_while_paused"))
                         break
 
-
+                # Log when exiting paused state
+                if was_paused and not self.paused:
+                    logger.info(t("scan_resumed"))
+                    was_paused = False
+
+                if self.stopped:
+                    break
+
                 try:
-                    file_result = future.result()
+                    file_result = future.result(timeout=1.0)  # Short timeout to allow quick stop
                     if file_result and file_result["matches"]:
                         results["matches"].append(file_result)
-                        logger.info(
+                        logger.info(t("found_matches_in_file", file_name=Path(file_result.get("file_path", "")).name))
+                except TimeoutError:
+                    logger.warning(t("task_timeout_scan_may_be_stopping"))
+                    if self.stopped:
+                        break
                 except Exception as e:
-
+                    if not self.stopped:
+                        logger.error(t("error_scanning_file", error=e))
 
                 processed += 1
 
                 # Report progress
                 if show_progress and processed % 10 == 0:
-                    logger.info(
+                    logger.info(t("progress_report", processed=processed, total=len(files)))
 
                 # Call progress callback if set
                 if self._progress_callback:
                     self._progress_callback(processed, len(files))
 
+        finally:
+            # Force shutdown if stopped
+            if self.stopped and self._executor:
+                logger.info(t("force_shutting_down_executor"))
+                if sys.version_info >= (3, 9):
+                    self._executor.shutdown(wait=False, cancel_futures=True)
+                else:
+                    self._executor.shutdown(wait=False)
+            else:
+                self._executor.shutdown(wait=True)
+            self._executor = None
+
         results["scan_info"]["files_with_matches"] = len(results["matches"])
         results["scan_info"]["files_processed"] = processed
         results["stopped"] = self.stopped
 
         if self.stopped:
-            logger.info(
+            logger.info(t("scan_stopped_processed_files", processed=processed))
         else:
-            logger.info(
+            logger.info(t("scan_complete_found_matches", matches_count=len(results["matches"])))
 
         return results
 
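The `finally` block above gates `cancel_futures` on the interpreter version because `Executor.shutdown()` only grew that keyword in Python 3.9; on older interpreters the best a forced stop can do is return without waiting. The guard in isolation:

```python
import sys
from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor(max_workers=4)
# ... submit work, then detect a stop request ...
if sys.version_info >= (3, 9):
    # 3.9+ can also cancel futures that are queued but not yet running.
    executor.shutdown(wait=False, cancel_futures=True)
else:
    executor.shutdown(wait=False)
```

Without `cancel_futures=True`, queued tasks still run to completion even though `shutdown()` returns immediately, which is why the loop also cancels pending futures by hand.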
@@ -299,24 +401,55 @@ class DocumentScanner:
         if self.stopped:
             return {}
 
+        # Check if paused before processing
+        while self.paused:
+            self.paused_event.wait(0.1)
+            if self.stopped:
+                return {}
+
         return self._scan_file(file_path)
 
     def _collect_files(self) -> list[Path]:
         """Collect all files matching the specified types."""
         files = []
+        image_extensions = ["jpg", "jpeg", "png", "gif", "bmp", "tiff"]
+
         for ext in self.file_types:
+            # If extension is an image format and OCR is not enabled, skip
+            if ext.lower() in image_extensions and not self.use_pdf_ocr:
+                continue
             files.extend(self.input_dir.rglob(f"*.{ext.lower()}"))
             files.extend(self.input_dir.rglob(f"*.{ext.upper()}"))
         return list(set(files))  # Remove duplicates
 
     def _scan_file(self, file_path: Path) -> dict[str, Any]:
         """Scan a single file and return matches."""
+        # Check if stopped before starting
+        if self.stopped:
+            return {}
+
+        # Check if paused before starting
+        while self.paused:
+            self.paused_event.wait(0.1)
+            if self.stopped:
+                return {}
+
         file_start_time = time.perf_counter()
         ext = file_path.suffix.lower().lstrip(".")
         text = ""
         metadata = {}
 
         try:
+            # Check if stopped before extraction
+            if self.stopped:
+                return {}
+
+            # Check if paused before extraction
+            while self.paused:
+                self.paused_event.wait(0.1)
+                if self.stopped:
+                    return {}
+
             # Route to appropriate extractor
             if ext == "pdf":
                 text, metadata = self._extract_pdf(file_path)
@@ -341,11 +474,26 @@
             elif ext in ["pptx", "ppt"]:
                 text, metadata = self._extract_pptx(file_path)
             elif ext in ["jpg", "jpeg", "png", "gif", "bmp", "tiff"]:
-
+                # Only extract images if OCR is enabled
+                if self.use_pdf_ocr:  # Using the same flag for consistency
+                    text, metadata = self._extract_image(file_path)
+                else:
+                    return {}  # Skip image files if OCR is disabled
             else:
                 text, metadata = self._extract_text(file_path)
+
+            # Check if stopped after extraction
+            if self.stopped:
+                return {}
+
+            # Check if paused after extraction
+            while self.paused:
+                self.paused_event.wait(0.1)
+                if self.stopped:
+                    return {}
+
         except Exception as e:
-            logger.warning(
+            logger.warning(t("could_not_extract_text_from_file", file_path=file_path, error=e))
             return {}
 
         processing_time = time.perf_counter() - file_start_time
@@ -353,9 +501,16 @@
         if not text:
             return {}
 
-        # Apply all rules
+        # Apply all rules with stop check
        file_matches = []
         for rule in self.rules:
+            if self.stopped:
+                return {}
+            # Check if paused before each rule
+            while self.paused:
+                self.paused_event.wait(0.1)
+                if self.stopped:
+                    return {}
             rule_matches = rule.search(text)
             if rule_matches:
                 for match in rule_matches:
@@ -369,7 +524,15 @@
         # Add processing time to metadata
         metadata["processing_time_seconds"] = round(processing_time, 3)
 
-        logger.info(
+        logger.info(
+            t(
+                "processed_file_info",
+                file_name=file_path.name,
+                ext=ext,
+                time=round(processing_time, 3),
+                matches_count=len(file_matches),
+            )
+        )
 
         return {
             "file_path": str(file_path),
@@ -381,60 +544,102 @@
 
     def _extract_pdf(self, file_path: Path) -> tuple[str, dict[str, Any]]:
         """Extract text from PDF file with fallback."""
+        # Try PyMuPDF first (faster)
         if fitz is not None:
-
-
-
-
-
-
+            try:
+                return self._extract_pdf_fitz(file_path)
+            except Exception as e:
+                logger.warning(t("pymupdf_failed_for_file", file_name=file_path.name, error=e))
+
+        # Fallback to pypdf
+        if pypdf is not None:
+            try:
+                return self._extract_pdf_pypdf(file_path)
+            except Exception as e:
+                logger.error(t("pypdf_also_failed_for_file", file_name=file_path.name, error=e))
+                return "", {}
+
+        logger.warning(t("no_pdf_library_installed"))
+        return "", {}
 
     def _extract_pdf_fitz(self, file_path: Path) -> tuple[str, dict[str, Any]]:
         """Extract text from PDF using PyMuPDF (fastest method)."""
         if not fitz:
-            logger.warning("
+            logger.warning(t("pymupdf_not_installed"))
             return "", {}
 
-        doc =
-
-
-
-
-
-
+        doc = None
+        try:
+            doc = fitz.open(str(file_path))
+            if doc.page_count == 0:
+                logger.warning(t("no_pages_found_in_file", file_path=file_path))
+                return "", {}
+            if not doc.metadata:
+                logger.warning(t("no_metadata_found_in_file", file_path=file_path))
+                return "", {}
 
-
-
-
-
-
-
-
-
+            text_parts = []
+            metadata = {
+                "page_count": doc.page_count,
+                "title": doc.metadata.get("title", ""),
+                "author": doc.metadata.get("author", ""),
+                "subject": doc.metadata.get("subject", ""),
+                "creator": doc.metadata.get("creator", ""),
+            }
 
-
-
-
+            if self.use_pdf_ocr and pytesseract and Image:
+                # OCR for image-based PDFs
+                import io
 
-
-
-
-
-
-
-
-
-
-
-
+                for page_num, page in enumerate(doc, 1):  # pyright: ignore[reportArgumentType]
+                    # Check if stopped before processing each page
+                    if self.stopped:
+                        doc.close()
+                        return "", {}
+
+                    # Check if paused before processing each page
+                    while self.paused:
+                        self.paused_event.wait(0.1)
+                        if self.stopped:
+                            doc.close()
+                            return "", {}
+
+                    pix = page.get_pixmap()
+                    img_data = pix.tobytes("png")
+                    image = Image.open(io.BytesIO(img_data))
+                    text = pytesseract.image_to_string(image)
+                    text_parts.append(f"[Page {page_num}]\n{text}")
+            else:
+                # Extract text directly (faster)
+                for page_num, page in enumerate(doc, 1):  # pyright: ignore[reportArgumentType]
+                    # Check if stopped before processing each page
+                    if self.stopped:
+                        doc.close()
+                        return "", {}
 
-
-
+                    # Check if paused before processing each page
+                    while self.paused:
+                        self.paused_event.wait(0.1)
+                        if self.stopped:
+                            doc.close()
+                            return "", {}
+
+                    text = page.get_text()
+                    text_parts.append(f"[Page {page_num}]\n{text}")
+
+            doc.close()
+            return "\n\n".join(text_parts), metadata
+        except Exception as e:
+            if doc:
+                doc.close()
+            logger.warning(t("pymupdf_error_trying_fallback", file_path=file_path, error=e))
+            # Re-raise to trigger fallback to pypdf
+            raise
 
     def _extract_pdf_pypdf(self, file_path: Path) -> tuple[str, dict[str, Any]]:
         """Extract text from PDF using pypdf (fallback method)."""
         if not pypdf:
-            logger.warning("
+            logger.warning(t("pypdf_not_installed_skipping_extraction"))
             return "", {}
 
         text_parts = []
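The rewritten `_extract_pdf` above establishes a two-tier strategy: `_extract_pdf_fitz` re-raises on failure precisely so the caller can fall back to pypdf. Stripped of the pause/stop plumbing, the control flow reduces to roughly this sketch (same library calls as the diff; error handling condensed, not the package's exact code):

```python
from pathlib import Path


def extract_pdf_text(path: Path) -> str:
    """Try PyMuPDF first (fast C extractor), then pypdf, else give up."""
    try:
        import fitz  # PyMuPDF
        doc = fitz.open(str(path))
        try:
            return "\n\n".join(page.get_text() for page in doc)
        finally:
            doc.close()
    except Exception:
        pass  # missing library or corrupt file: fall through to pypdf
    try:
        import pypdf
        with open(path, "rb") as f:
            reader = pypdf.PdfReader(f)
            return "\n\n".join(page.extract_text() or "" for page in reader.pages)
    except Exception:
        return ""  # neither extractor available, or both failed
```

Ordering matters here: PyMuPDF is markedly faster, so pypdf only pays its cost on the files PyMuPDF cannot handle.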
@@ -444,7 +649,7 @@
             pdf_reader = pypdf.PdfReader(f)
 
             if not pdf_reader.metadata:
-                logger.warning(
+                logger.warning(t("no_metadata_found_in_file", file_path=file_path))
                 return "", {}
 
             metadata = {
@@ -454,11 +659,21 @@
             }
 
             for page_num, page in enumerate(pdf_reader.pages, 1):
+                # Check if stopped before processing each page
+                if self.stopped:
+                    return "", {}
+
+                # Check if paused before processing each page
+                while self.paused:
+                    self.paused_event.wait(0.1)
+                    if self.stopped:
+                        return "", {}
+
                 text = page.extract_text()
                 text_parts.append(f"[Page {page_num}]\n{text}")
 
         except Exception as e:
-            logger.warning(
+            logger.warning(t("error_extracting_pdf_with_pypdf", error=e))
             return "", {}
 
         return "\n\n".join(text_parts), metadata
@@ -466,7 +681,7 @@
     def _extract_odt(self, file_path: Path) -> tuple[str, dict[str, Any]]:
         """Extract text from ODT (OpenDocument Text) file."""
         if odf_odt is None:
-            logger.warning("
+            logger.warning(t("odfpy_not_installed_skipping_extraction"))
             return "", {}
 
         try:
@@ -479,7 +694,7 @@
 
             return text, metadata
         except Exception as e:
-            logger.warning(
+            logger.warning(t("error_extracting_odt", error=e))
             return "", {}
 
     def _extract_rtf(self, file_path: Path) -> tuple[str, dict[str, Any]]:
@@ -514,13 +729,13 @@
 
             return text, metadata
         except Exception as e:
-            logger.warning(
+            logger.warning(t("error_extracting_rtf", error=e))
             return "", {}
 
     def _extract_epub(self, file_path: Path) -> tuple[str, dict[str, Any]]:
         """Extract text from EPUB (ebook) file."""
         if ebooklib is None:
-            logger.warning("
+            logger.warning(t("ebooklib_not_installed_skipping_extraction"))
             return "", {}
 
         try:
@@ -529,6 +744,16 @@
 
             # Extract text from all items
             for item in book.get_items():
+                # Check if stopped before processing each item
+                if self.stopped:
+                    return "", {}
+
+                # Check if paused before processing each item
+                while self.paused:
+                    self.paused_event.wait(0.1)
+                    if self.stopped:
+                        return "", {}
+
                 if item.get_type() == ebooklib.ITEM_DOCUMENT:  # pyright: ignore[reportAttributeAccessIssue]
                     # Remove HTML tags
                     html_content = item.get_content().decode("utf-8")  # pyright: ignore[reportAttributeAccessIssue]
@@ -546,7 +771,7 @@
 
             return "\n\n".join(text_parts), metadata
         except Exception as e:
-            logger.warning(
+            logger.warning(t("error_extracting_epub", error=e))
             return "", {}
 
     def _extract_csv(self, file_path: Path) -> tuple[str, dict[str, Any]]:
@@ -556,6 +781,16 @@
             with open(file_path, encoding="utf-8", errors="ignore") as f:
                 reader = csv.reader(f)
                 for row in reader:
+                    # Check if stopped periodically during row processing
+                    if self.stopped:
+                        return "", {}
+
+                    # Check if paused periodically during row processing
+                    while self.paused:
+                        self.paused_event.wait(0.1)
+                        if self.stopped:
+                            return "", {}
+
                     row_text = " | ".join(str(cell) for cell in row)
                     text_parts.append(row_text)
 
@@ -565,7 +800,7 @@
 
             return "\n".join(text_parts), metadata
         except Exception as e:
-            logger.warning(
+            logger.warning(t("error_extracting_csv", error=e))
             return "", {}
 
     def _extract_xml(self, file_path: Path) -> tuple[str, dict[str, Any]]:
@@ -585,7 +820,7 @@
 
             return text, metadata
         except Exception as e:
-            logger.warning(
+            logger.warning(t("error_extracting_xml", error=e))
             return "", {}
 
     def _extract_html(self, file_path: Path) -> tuple[str, dict[str, Any]]:
@@ -607,7 +842,7 @@
 
             return text, metadata
         except Exception as e:
-            logger.warning(
+            logger.warning(t("error_extracting_html", error=e))
             return "", {}
 
     def _extract_markdown(self, file_path: Path) -> tuple[str, dict[str, Any]]:
@@ -634,23 +869,43 @@
 
             return text, metadata
         except Exception as e:
-            logger.warning(
+            logger.warning(t("error_extracting_markdown", error=e))
             return "", {}
 
     def _extract_docx(self, file_path: Path) -> tuple[str, dict[str, Any]]:
         """Extract text from DOCX file."""
         if Document is None:
-            logger.warning("
+            logger.warning(t("python_docx_not_installed_skipping_extraction"))
             return "", {}
 
         doc = Document(str(file_path))
         text_parts = []
 
         for paragraph in doc.paragraphs:
+            # Check if stopped periodically during paragraph processing
+            if self.stopped:
+                return "", {}
+
+            # Check if paused periodically during paragraph processing
+            while self.paused:
+                self.paused_event.wait(0.1)
+                if self.stopped:
+                    return "", {}
+
             text_parts.append(paragraph.text)
 
         # Extract tables
         for table in doc.tables:
+            # Check if stopped before processing each table
+            if self.stopped:
+                return "", {}
+
+            # Check if paused before processing each table
+            while self.paused:
+                self.paused_event.wait(0.1)
+                if self.stopped:
+                    return "", {}
+
             for row in table.rows:
                 row_text = " | ".join(cell.text for cell in row.cells)
                 text_parts.append(row_text)
@@ -665,16 +920,40 @@
     def _extract_xlsx(self, file_path: Path) -> tuple[str, dict[str, Any]]:
         """Extract text from XLSX file."""
         if load_workbook is None:
-            logger.warning("
+            logger.warning(t("openpyxl_not_installed_skipping_extraction"))
             return "", {}
 
         wb = load_workbook(file_path, read_only=True, data_only=True)
         text_parts = []
 
         for sheet_name in wb.sheetnames:
+            # Check if stopped before processing each sheet
+            if self.stopped:
+                wb.close()
+                return "", {}
+
+            # Check if paused before processing each sheet
+            while self.paused:
+                self.paused_event.wait(0.1)
+                if self.stopped:
+                    wb.close()
+                    return "", {}
+
             sheet = wb[sheet_name]
             text_parts.append(f"[Sheet: {sheet_name}]")
             for row in sheet.iter_rows(values_only=True):
+                # Check if stopped periodically during row processing
+                if self.stopped:
+                    wb.close()
+                    return "", {}
+
+                # Check if paused periodically during row processing
+                while self.paused:
+                    self.paused_event.wait(0.1)
+                    if self.stopped:
+                        wb.close()
+                        return "", {}
+
                 row_text = " | ".join(str(cell) if cell is not None else "" for cell in row)
                 if row_text.strip():
                     text_parts.append(row_text)
@@ -684,6 +963,7 @@
             "sheets": wb.sheetnames,
         }
 
+        wb.close()
         return "\n".join(text_parts), metadata
 
     def _extract_pptx(self, file_path: Path) -> tuple[str, dict[str, Any]]:
@@ -691,13 +971,23 @@
         try:
             from pptx import Presentation
         except ImportError:
-            logger.warning("
+            logger.warning(t("python_pptx_not_installed_skipping_extraction"))
             return "", {}
 
         prs = Presentation(str(file_path))
         text_parts = []
 
         for slide_num, slide in enumerate(prs.slides, 1):
+            # Check if stopped before processing each slide
+            if self.stopped:
+                return "", {}
+
+            # Check if paused before processing each slide
+            while self.paused:
+                self.paused_event.wait(0.1)
+                if self.stopped:
+                    return "", {}
+
             text_parts.append(f"[Slide {slide_num}]")
             for shape in slide.shapes:
                 if hasattr(shape, "text"):
@@ -712,7 +1002,7 @@
     def _extract_image(self, file_path: Path) -> tuple[str, dict[str, Any]]:
         """Extract text from image file using OCR."""
         if Image is None or pytesseract is None:
-            logger.warning("
+            logger.warning(t("pillow_or_tesseract_not_installed_skipping_ocr"))
             return "", {}
 
         try:
@@ -727,7 +1017,7 @@
 
             return text, metadata
         except Exception as e:
-            logger.warning(
+            logger.warning(t("could_not_perform_ocr_on_file", file_path=file_path, error=e))
             return "", {}
 
     def _extract_text(self, file_path: Path) -> tuple[str, dict[str, Any]]:
@@ -747,36 +1037,50 @@
 
 def main():
     """Main entry point for document scanner."""
-
-
-    )
-
-
-
+    # Parse the language argument first, without using translations
+    temp_parser = argparse.ArgumentParser(add_help=False)
+    temp_parser.add_argument("--lang", choices=["en", "zh"], default="zh")
+    temp_args, _ = temp_parser.parse_known_args()
+
+    # Set the language
+    global USE_CHINESE
+    USE_CHINESE = temp_args.lang == "zh"
+
+    parser = argparse.ArgumentParser(description=t("document_scanner_description"))
+    parser.add_argument("input", type=str, nargs="?", default=str(cwd), help=t("input_directory_help"))
+    parser.add_argument("-r", "--rules", type=str, default="rules.json", help=t("rules_file_help"))
+    parser.add_argument("--recursive", action="store_true", help=t("recursive_help"))
     parser.add_argument(
         "-f",
         "--file-types",
-        help="
-        default="pdf,docx,xlsx,pptx,txt,odt,rtf,epub,csv,xml,html,md",
+        help=t("file_types_help"),
+        default="pdf,docx,xlsx,pptx,txt,odt,rtf,epub,csv,xml,html,md,jpg,jpeg,png,gif,bmp,tiff",
     )
-    parser.add_argument("--use-pdf-ocr", help="
+    parser.add_argument("--use-pdf-ocr", help=t("use_pdf_ocr_help"), action="store_true")
     parser.add_argument(
         "--use-process-pool",
-        help="
+        help=t("use_process_pool_help"),
         action="store_true",
     )
     parser.add_argument(
         "-b",
         "--batch-size",
-        help="
+        help=t("batch_size_help"),
         default=50,
         type=int,
     )
-    parser.add_argument("-t", "--threads", help="
-    parser.add_argument("--progress", help="
-    parser.add_argument("-v", "--verbose", help="
+    parser.add_argument("-t", "--threads", help=t("threads_help"), default=4, type=int)
+    parser.add_argument("--progress", help=t("progress_help"), action="store_true")
+    parser.add_argument("-v", "--verbose", help=t("verbose_help"), action="store_true")
+
+    # Add the language argument
+    parser.add_argument("--lang", help=t("language_help"), choices=["en", "zh"], default="zh")
+
     args = parser.parse_args()
 
+    # Confirm the language setting again (in case the user changed it in the full arguments)
+    USE_CHINESE = args.lang == "zh"
+
     if args.verbose:
         logger.setLevel(logging.DEBUG)
 
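`main()` now runs argparse twice: a throwaway pass with `parse_known_args()` reads only `--lang` so that `USE_CHINESE` is set before the real parser builds its translated help strings, then `--lang` is declared again on the full parser so it still appears in `--help`. The two-pass trick in isolation (parser description is a placeholder, not the package's translated string):

```python
import argparse

# Pass 1: extract --lang only; parse_known_args() ignores all other flags
# and add_help=False prevents this parser from consuming --help.
pre = argparse.ArgumentParser(add_help=False)
pre.add_argument("--lang", choices=["en", "zh"], default="zh")
pre_args, _ = pre.parse_known_args()
use_chinese = pre_args.lang == "zh"  # decides which help strings t() returns

# Pass 2: the real parser, now free to call t() for every help= value.
parser = argparse.ArgumentParser(description="document scanner")
parser.add_argument("--lang", choices=["en", "zh"], default="zh")
args = parser.parse_args()
```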
@@ -784,9 +1088,9 @@ def main():
     # Validate input directory
     input_dir = Path(args.input)
     if not input_dir.exists() or not input_dir.is_dir():
-        logger.error(
+        logger.error(t("input_directory_does_not_exist", input_dir=args.input))
         return
-    logger.info(
+    logger.info(t("scanning_directory", directory=str(input_dir)))
 
     # Load rules file
     rules_file = Path(args.rules)
@@ -796,15 +1100,15 @@
         if rule_files_in_input_dir:
             rules_file = rule_files_in_input_dir[0]
         else:
-            logger.error(
+            logger.error(t("rules_file_does_not_exist_alt", rules_file=args.rules))
             return
-    logger.info(
+    logger.info(t("using_rules_file", rules_file=str(rules_file)))
 
     try:
         with open(rules_file, encoding="utf-8") as f:
             rules_data = json.load(f)
     except json.JSONDecodeError as e:
-        logger.error(
+        logger.error(t("invalid_json_in_rules_file", error=e))
         return
 
     # Parse rules
@@ -814,11 +1118,11 @@
     elif isinstance(rules_data, dict) and "rules" in rules_data:
         rules = [Rule(rule) for rule in rules_data["rules"]]
     else:
-        logger.error("
+        logger.error(t("invalid_rules_format"))
         return
 
     if not rules:
-        logger.error("
+        logger.error(t("no_valid_rules_found"))
         return
 
     # Parse file types
@@ -833,8 +1137,8 @@
     with open(output_file, "w", encoding="utf-8") as f:
         json.dump(results, f, indent=2, ensure_ascii=False)
 
-    logger.info(
-    logger.info(
+    logger.info(t("results_saved_to", path=str(output_file)))
+    logger.info(t("total_time_elapsed", time=round(time.perf_counter() - t0, 2)))
 
 
 if __name__ == "__main__":