pysfi 0.1.7__py3-none-any.whl → 0.1.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {pysfi-0.1.7.dist-info → pysfi-0.1.11.dist-info}/METADATA +11 -9
  2. pysfi-0.1.11.dist-info/RECORD +60 -0
  3. pysfi-0.1.11.dist-info/entry_points.txt +28 -0
  4. sfi/__init__.py +1 -1
  5. sfi/alarmclock/alarmclock.py +40 -40
  6. sfi/bumpversion/__init__.py +1 -1
  7. sfi/cleanbuild/cleanbuild.py +155 -0
  8. sfi/condasetup/condasetup.py +116 -0
  9. sfi/docscan/__init__.py +1 -1
  10. sfi/docscan/docscan.py +407 -103
  11. sfi/docscan/docscan_gui.py +1282 -596
  12. sfi/docscan/lang/eng.py +152 -0
  13. sfi/docscan/lang/zhcn.py +170 -0
  14. sfi/filedate/filedate.py +185 -112
  15. sfi/gittool/__init__.py +2 -0
  16. sfi/gittool/gittool.py +401 -0
  17. sfi/llmclient/llmclient.py +592 -0
  18. sfi/llmquantize/llmquantize.py +480 -0
  19. sfi/llmserver/llmserver.py +335 -0
  20. sfi/makepython/makepython.py +31 -30
  21. sfi/pdfsplit/pdfsplit.py +173 -173
  22. sfi/pyarchive/pyarchive.py +418 -0
  23. sfi/pyembedinstall/pyembedinstall.py +629 -0
  24. sfi/pylibpack/__init__.py +0 -0
  25. sfi/pylibpack/pylibpack.py +1457 -0
  26. sfi/pylibpack/rules/numpy.json +22 -0
  27. sfi/pylibpack/rules/pymupdf.json +10 -0
  28. sfi/pylibpack/rules/pyqt5.json +19 -0
  29. sfi/pylibpack/rules/pyside2.json +23 -0
  30. sfi/pylibpack/rules/scipy.json +23 -0
  31. sfi/pylibpack/rules/shiboken2.json +24 -0
  32. sfi/pyloadergen/pyloadergen.py +512 -227
  33. sfi/pypack/__init__.py +0 -0
  34. sfi/pypack/pypack.py +1142 -0
  35. sfi/pyprojectparse/__init__.py +0 -0
  36. sfi/pyprojectparse/pyprojectparse.py +500 -0
  37. sfi/pysourcepack/pysourcepack.py +308 -0
  38. sfi/quizbase/__init__.py +0 -0
  39. sfi/quizbase/quizbase.py +828 -0
  40. sfi/quizbase/quizbase_gui.py +987 -0
  41. sfi/regexvalidate/__init__.py +0 -0
  42. sfi/regexvalidate/regex_help.html +284 -0
  43. sfi/regexvalidate/regexvalidate.py +468 -0
  44. sfi/taskkill/taskkill.py +0 -2
  45. sfi/workflowengine/__init__.py +0 -0
  46. sfi/workflowengine/workflowengine.py +444 -0
  47. pysfi-0.1.7.dist-info/RECORD +0 -31
  48. pysfi-0.1.7.dist-info/entry_points.txt +0 -15
  49. sfi/embedinstall/embedinstall.py +0 -418
  50. sfi/projectparse/projectparse.py +0 -152
  51. sfi/pypacker/fspacker.py +0 -91
  52. {pysfi-0.1.7.dist-info → pysfi-0.1.11.dist-info}/WHEEL +0 -0
  53. /sfi/{embedinstall → docscan/lang}/__init__.py +0 -0
  54. /sfi/{projectparse → llmquantize}/__init__.py +0 -0
  55. /sfi/{pypacker → pyembedinstall}/__init__.py +0 -0
sfi/docscan/docscan.py CHANGED
@@ -3,11 +3,13 @@
  from __future__ import annotations

  import argparse
+ import contextlib
  import csv
  import html
  import json
  import logging
  import re
+ import sys
  import threading
  import time
  import xml.etree.ElementTree as ET
@@ -62,6 +64,42 @@ try:
  except ImportError:
      pypdf = None

+ # Language support imports
+ try:
+     from sfi.docscan.lang.eng import ENGLISH_DEFAULTS as EN_TRANSLATIONS
+     from sfi.docscan.lang.zhcn import TRANSLATIONS as ZH_TRANSLATIONS
+ except ImportError:
+     try:
+         from lang.eng import ENGLISH_DEFAULTS as EN_TRANSLATIONS
+         from lang.zhcn import TRANSLATIONS as ZH_TRANSLATIONS
+     except ImportError:
+         # Fallback translations if import fails
+         ZH_TRANSLATIONS = {}
+         EN_TRANSLATIONS = {}
+
+ # Global language setting
+ USE_CHINESE = True  # Default to Chinese
+
+
+ def t(key: str, **kwargs) -> str:
+     """Get translated text for the given key.
+
+     Args:
+         key: Translation key
+         **kwargs: Arguments for string formatting
+
+     Returns:
+         Translated text
+     """
+     text = ZH_TRANSLATIONS.get(key, key) if USE_CHINESE else EN_TRANSLATIONS.get(key, key)
+
+     # Format with kwargs if provided
+     if kwargs:
+         with contextlib.suppress(KeyError, ValueError):
+             text = text.format(**kwargs)
+     return text
+
+
  logging.basicConfig(level=logging.INFO, format="%(message)s")
  logger = logging.getLogger(__name__)
  cwd = Path.cwd()
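The `t()` helper added above fails soft twice over: a key missing from the active table comes back as the key itself, and `contextlib.suppress(KeyError, ValueError)` keeps a mismatched format placeholder from turning a log call into a crash. A minimal sketch of that lookup behavior, using made-up translation entries rather than the package's real `lang` tables:

    import contextlib

    EN_TRANSLATIONS = {"scan_resumed": "Scan resumed"}
    ZH_TRANSLATIONS = {}
    USE_CHINESE = False

    def t(key: str, **kwargs) -> str:
        # Missing key: echo the key back instead of raising
        text = ZH_TRANSLATIONS.get(key, key) if USE_CHINESE else EN_TRANSLATIONS.get(key, key)
        if kwargs:
            # A placeholder mismatch must never break a log statement
            with contextlib.suppress(KeyError, ValueError):
                text = text.format(**kwargs)
        return text

    print(t("scan_resumed"))                  # "Scan resumed"
    print(t("progress_report", processed=3))  # untranslated key echoed back: "progress_report"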
@@ -85,7 +123,7 @@ class Rule:
                  # Use re.ASCII for faster matching when possible
                  self.compiled_pattern = re.compile(self.pattern, flags | re.ASCII)
              except re.error as e:
-                 logger.warning(f"Invalid regex pattern '{self.pattern}': {e}")
+                 logger.warning(t("invalid_regex_pattern", pattern=self.pattern, error=e))
                  self.compiled_pattern = None
          else:
              self.compiled_pattern = None
@@ -173,6 +211,7 @@ class DocumentScanner:
          self.paused_event.set()  # Initially not paused
          self.stopped = False
          self._progress_callback = None
+         self._executor = None  # Keep reference to executor for forced shutdown

      def set_progress_callback(self, callback: Callable[[int, int], None]) -> None:
          """Set callback function for progress updates.
@@ -186,19 +225,18 @@
          """Pause the scanning process."""
          self.paused = True
          self.paused_event.clear()
-         logger.info("Scan paused")

      def resume(self) -> None:
          """Resume the scanning process."""
          self.paused = False
          self.paused_event.set()
-         logger.info("Scan resumed")
+         logger.info(t("scan_resumed"))

      def stop(self) -> None:
          """Stop the scanning process."""
          self.stopped = True
          self.paused_event.set()  # Ensure thread can exit
-         logger.info("Scan stopped")
+         logger.info(t("scan_stopped"))

      def is_paused(self) -> bool:
          """Check if the scanner is paused."""
@@ -222,9 +260,9 @@
          self.paused = False
          self.paused_event.set()

-         logger.info(f"Scanning directory: {self.input_dir}")
+         logger.info(t("scanning_directory", directory=str(self.input_dir)))
          files = self._collect_files()
-         logger.info(f"Found {len(files)} files to scan")
+         logger.info(t("found_files_to_scan", count=len(files)))

          results = {
              "scan_info": {
@@ -243,53 +281,117 @@
          # Scan files in parallel
          processed = 0
          executor_class = ProcessPoolExecutor if self.use_process_pool else ThreadPoolExecutor
-         with executor_class(max_workers=threads) as executor:
-             future_to_file = {executor.submit(self._scan_file_with_pause_check, file): file for file in files}
+         executor = executor_class(max_workers=threads)
+         self._executor = executor  # Keep reference for forced shutdown

-             for future in as_completed(future_to_file):
+         try:
+             # Submit futures one by one to respect pause state
+             submitted_futures = []
+             was_paused = False  # Track previous pause state
+
+             for file in files:
+                 # Check if stopped before submitting more tasks
+                 if self.stopped:
+                     logger.info(t("scan_stopped_before_submitting_tasks"))
+                     break
+
+                 # Wait if paused before submitting new tasks
+                 while self.paused:
+                     # Log when entering paused state
+                     if not was_paused:
+                         logger.info(t("scan_paused"))
+                         was_paused = True
+
+                     self.paused_event.wait(0.1)
+                     if self.stopped:
+                         logger.info(t("scan_stopped_while_paused"))
+                         break
+
+                 # Log when exiting paused state
+                 if was_paused and not self.paused:
+                     logger.info(t("scan_resumed"))
+                     was_paused = False
+
+                 if self.stopped:
+                     break
+
+                 future = executor.submit(self._scan_file_with_pause_check, file)
+                 submitted_futures.append(future)
+
+             # Process completed futures
+             for future in as_completed(submitted_futures):
                  # Check if stopped before processing this future
                  if self.stopped:
-                     logger.info("Scan stopped by user, cancelling remaining tasks...")
+                     logger.info(t("scan_stopped_by_user_canceling_tasks"))
                      # Cancel all remaining futures
-                     for f in future_to_file:
+                     for f in submitted_futures:
                          if not f.done():
                              f.cancel()
                      break

-                 # Wait if paused
+                 # Wait if paused before processing result
                  while self.paused:
-                     time.sleep(0.1)
+                     # Log when entering paused state
+                     if not was_paused:
+                         logger.info(t("scan_paused"))
+                         was_paused = True
+
+                     self.paused_event.wait(0.1)
                      if self.stopped:
-                         logger.info("Scan stopped while paused")
+                         logger.info(t("scan_stopped_while_paused"))
                          break

-                 file_path = future_to_file[future]
+                 # Log when exiting paused state
+                 if was_paused and not self.paused:
+                     logger.info(t("scan_resumed"))
+                     was_paused = False
+
+                 if self.stopped:
+                     break
+
                  try:
-                     file_result = future.result()
+                     file_result = future.result(timeout=1.0)  # Short timeout to allow quick stop
                      if file_result and file_result["matches"]:
                          results["matches"].append(file_result)
-                         logger.info(f"Found matches in: {file_path.name}")
+                         logger.info(t("found_matches_in_file", file_name=Path(file_result.get("file_path", "")).name))
+                 except TimeoutError:
+                     logger.warning(t("task_timeout_scan_may_be_stopping"))
+                     if self.stopped:
+                         break
                  except Exception as e:
-                     logger.error(f"Error scanning {file_path}: {e}")
+                     if not self.stopped:
+                         logger.error(t("error_scanning_file", error=e))

                  processed += 1

                  # Report progress
                  if show_progress and processed % 10 == 0:
-                     logger.info(f"Progress: {processed}/{len(files)} files processed")
+                     logger.info(t("progress_report", processed=processed, total=len(files)))

                  # Call progress callback if set
                  if self._progress_callback:
                      self._progress_callback(processed, len(files))

+         finally:
+             # Force shutdown if stopped
+             if self.stopped and self._executor:
+                 logger.info(t("force_shutting_down_executor"))
+                 if sys.version_info >= (3, 9):
+                     self._executor.shutdown(wait=False, cancel_futures=True)
+                 else:
+                     self._executor.shutdown(wait=False)
+             else:
+                 self._executor.shutdown(wait=True)
+             self._executor = None
+
          results["scan_info"]["files_with_matches"] = len(results["matches"])
          results["scan_info"]["files_processed"] = processed
          results["stopped"] = self.stopped

          if self.stopped:
-             logger.info(f"Scan stopped. Processed {processed} files")
+             logger.info(t("scan_stopped_processed_files", processed=processed))
          else:
-             logger.info(f"Scan complete. Found matches in {len(results['matches'])} files")
+             logger.info(t("scan_complete_found_matches", matches_count=len(results["matches"])))

          return results

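Dropping the `with executor_class(...)` context manager is what makes the `finally` block meaningful: the context manager's `__exit__` always waits for pending futures, whereas an explicit handle can be shut down abruptly. `shutdown(wait=False, cancel_futures=True)` was added to `concurrent.futures` in Python 3.9, which is why the code gates on `sys.version_info`. A standalone sketch of that choice:

    import sys
    from concurrent.futures import ThreadPoolExecutor

    executor = ThreadPoolExecutor(max_workers=4)
    futures = [executor.submit(pow, 2, n) for n in range(1000)]

    stopped = True  # imagine the user pressed "stop"
    if stopped:
        if sys.version_info >= (3, 9):
            # Discard everything still queued and return without blocking
            executor.shutdown(wait=False, cancel_futures=True)
        else:
            # Older Pythons can only decline to wait; queued tasks still run
            executor.shutdown(wait=False)
    else:
        executor.shutdown(wait=True)  # normal path: drain all submitted work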
@@ -299,24 +401,55 @@
          if self.stopped:
              return {}

+         # Check if paused before processing
+         while self.paused:
+             self.paused_event.wait(0.1)
+             if self.stopped:
+                 return {}
+
          return self._scan_file(file_path)

      def _collect_files(self) -> list[Path]:
          """Collect all files matching the specified types."""
          files = []
+         image_extensions = ["jpg", "jpeg", "png", "gif", "bmp", "tiff"]
+
          for ext in self.file_types:
+             # If extension is an image format and OCR is not enabled, skip
+             if ext.lower() in image_extensions and not self.use_pdf_ocr:
+                 continue
              files.extend(self.input_dir.rglob(f"*.{ext.lower()}"))
              files.extend(self.input_dir.rglob(f"*.{ext.upper()}"))
          return list(set(files))  # Remove duplicates

      def _scan_file(self, file_path: Path) -> dict[str, Any]:
          """Scan a single file and return matches."""
+         # Check if stopped before starting
+         if self.stopped:
+             return {}
+
+         # Check if paused before starting
+         while self.paused:
+             self.paused_event.wait(0.1)
+             if self.stopped:
+                 return {}
+
          file_start_time = time.perf_counter()
          ext = file_path.suffix.lower().lstrip(".")
          text = ""
          metadata = {}

          try:
+             # Check if stopped before extraction
+             if self.stopped:
+                 return {}
+
+             # Check if paused before extraction
+             while self.paused:
+                 self.paused_event.wait(0.1)
+                 if self.stopped:
+                     return {}
+
              # Route to appropriate extractor
              if ext == "pdf":
                  text, metadata = self._extract_pdf(file_path)
@@ -341,11 +474,26 @@
              elif ext in ["pptx", "ppt"]:
                  text, metadata = self._extract_pptx(file_path)
              elif ext in ["jpg", "jpeg", "png", "gif", "bmp", "tiff"]:
-                 text, metadata = self._extract_image(file_path)
+                 # Only extract images if OCR is enabled
+                 if self.use_pdf_ocr:  # Using the same flag for consistency
+                     text, metadata = self._extract_image(file_path)
+                 else:
+                     return {}  # Skip image files if OCR is disabled
              else:
                  text, metadata = self._extract_text(file_path)
+
+             # Check if stopped after extraction
+             if self.stopped:
+                 return {}
+
+             # Check if paused after extraction
+             while self.paused:
+                 self.paused_event.wait(0.1)
+                 if self.stopped:
+                     return {}
+
          except Exception as e:
-             logger.warning(f"Could not extract text from {file_path}: {e}")
+             logger.warning(t("could_not_extract_text_from_file", file_path=file_path, error=e))
              return {}

          processing_time = time.perf_counter() - file_start_time
@@ -353,9 +501,16 @@

          if not text:
              return {}
-         # Apply all rules
+         # Apply all rules with stop check
          file_matches = []
          for rule in self.rules:
+             if self.stopped:
+                 return {}
+             # Check if paused before each rule
+             while self.paused:
+                 self.paused_event.wait(0.1)
+                 if self.stopped:
+                     return {}
              rule_matches = rule.search(text)
              if rule_matches:
                  for match in rule_matches:
@@ -369,7 +524,15 @@
          # Add processing time to metadata
          metadata["processing_time_seconds"] = round(processing_time, 3)

-         logger.info(f"Processed {file_path.name} ({ext}) in {processing_time:.3f}s - {len(file_matches)} matches found")
+         logger.info(
+             t(
+                 "processed_file_info",
+                 file_name=file_path.name,
+                 ext=ext,
+                 time=round(processing_time, 3),
+                 matches_count=len(file_matches),
+             )
+         )

          return {
              "file_path": str(file_path),
@@ -381,60 +544,102 @@

      def _extract_pdf(self, file_path: Path) -> tuple[str, dict[str, Any]]:
          """Extract text from PDF file with fallback."""
+         # Try PyMuPDF first (faster)
          if fitz is not None:
-             return self._extract_pdf_fitz(file_path)
-         elif pypdf is not None:
-             return self._extract_pdf_pypdf(file_path)
-         else:
-             logger.warning("No PDF library installed (pymupdf or pypdf)")
-             return "", {}
+             try:
+                 return self._extract_pdf_fitz(file_path)
+             except Exception as e:
+                 logger.warning(t("pymupdf_failed_for_file", file_name=file_path.name, error=e))
+
+         # Fallback to pypdf
+         if pypdf is not None:
+             try:
+                 return self._extract_pdf_pypdf(file_path)
+             except Exception as e:
+                 logger.error(t("pypdf_also_failed_for_file", file_name=file_path.name, error=e))
+                 return "", {}
+
+         logger.warning(t("no_pdf_library_installed"))
+         return "", {}

      def _extract_pdf_fitz(self, file_path: Path) -> tuple[str, dict[str, Any]]:
          """Extract text from PDF using PyMuPDF (fastest method)."""
          if not fitz:
-             logger.warning("PyMuPDF not installed")
+             logger.warning(t("pymupdf_not_installed"))
              return "", {}

-         doc = fitz.open(str(file_path))
-         if doc.page_count == 0:
-             logger.warning(f"No pages found in {file_path}")
-             return "", {}
-         if not doc.metadata:
-             logger.warning(f"No metadata found in {file_path}")
-             return "", {}
+         doc = None
+         try:
+             doc = fitz.open(str(file_path))
+             if doc.page_count == 0:
+                 logger.warning(t("no_pages_found_in_file", file_path=file_path))
+                 return "", {}
+             if not doc.metadata:
+                 logger.warning(t("no_metadata_found_in_file", file_path=file_path))
+                 return "", {}

-         text_parts = []
-         metadata = {
-             "page_count": doc.page_count,
-             "title": doc.metadata.get("title", ""),
-             "author": doc.metadata.get("author", ""),
-             "subject": doc.metadata.get("subject", ""),
-             "creator": doc.metadata.get("creator", ""),
-         }
+             text_parts = []
+             metadata = {
+                 "page_count": doc.page_count,
+                 "title": doc.metadata.get("title", ""),
+                 "author": doc.metadata.get("author", ""),
+                 "subject": doc.metadata.get("subject", ""),
+                 "creator": doc.metadata.get("creator", ""),
+             }

-         if self.use_pdf_ocr and pytesseract and Image:
-             # OCR for image-based PDFs
-             import io
+             if self.use_pdf_ocr and pytesseract and Image:
+                 # OCR for image-based PDFs
+                 import io

-             for page_num, page in enumerate(doc, 1):  # pyright: ignore[reportArgumentType]
-                 pix = page.get_pixmap()
-                 img_data = pix.tobytes("png")
-                 image = Image.open(io.BytesIO(img_data))
-                 text = pytesseract.image_to_string(image)
-                 text_parts.append(f"[Page {page_num}]\n{text}")
-         else:
-             # Extract text directly (faster)
-             for page_num, page in enumerate(doc, 1):  # pyright: ignore[reportArgumentType]
-                 text = page.get_text()
-                 text_parts.append(f"[Page {page_num}]\n{text}")
+                 for page_num, page in enumerate(doc, 1):  # pyright: ignore[reportArgumentType]
+                     # Check if stopped before processing each page
+                     if self.stopped:
+                         doc.close()
+                         return "", {}
+
+                     # Check if paused before processing each page
+                     while self.paused:
+                         self.paused_event.wait(0.1)
+                         if self.stopped:
+                             doc.close()
+                             return "", {}
+
+                     pix = page.get_pixmap()
+                     img_data = pix.tobytes("png")
+                     image = Image.open(io.BytesIO(img_data))
+                     text = pytesseract.image_to_string(image)
+                     text_parts.append(f"[Page {page_num}]\n{text}")
+             else:
+                 # Extract text directly (faster)
+                 for page_num, page in enumerate(doc, 1):  # pyright: ignore[reportArgumentType]
+                     # Check if stopped before processing each page
+                     if self.stopped:
+                         doc.close()
+                         return "", {}

-         doc.close()
-         return "\n\n".join(text_parts), metadata
+                     # Check if paused before processing each page
+                     while self.paused:
+                         self.paused_event.wait(0.1)
+                         if self.stopped:
+                             doc.close()
+                             return "", {}
+
+                     text = page.get_text()
+                     text_parts.append(f"[Page {page_num}]\n{text}")
+
+             doc.close()
+             return "\n\n".join(text_parts), metadata
+         except Exception as e:
+             if doc:
+                 doc.close()
+             logger.warning(t("pymupdf_error_trying_fallback", file_path=file_path, error=e))
+             # Re-raise to trigger fallback to pypdf
+             raise

      def _extract_pdf_pypdf(self, file_path: Path) -> tuple[str, dict[str, Any]]:
          """Extract text from PDF using pypdf (fallback method)."""
          if not pypdf:
-             logger.warning("pypdf not installed, skipping PDF extraction")
+             logger.warning(t("pypdf_not_installed_skipping_extraction"))
              return "", {}

          text_parts = []
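`_extract_pdf` now forms a fallback chain rather than an either/or dispatch: `_extract_pdf_fitz` re-raises on failure precisely so a file PyMuPDF chokes on gets a second chance with pypdf. The shape of the pattern, reduced to a sketch with hypothetical extractor stubs:

    def extract_fast(path):          # stand-in for the PyMuPDF path
        raise RuntimeError("unsupported encryption")

    def extract_compatible(path):    # stand-in for the pypdf path
        return "page text", {"page_count": 1}

    def extract_with_fallback(path):
        last_error = None
        for extract in (extract_fast, extract_compatible):
            try:
                return extract(path)      # first extractor to succeed wins
            except Exception as e:        # a failure falls through to the next one
                last_error = e
        print(f"all extractors failed for {path}: {last_error}")
        return "", {}

    print(extract_with_fallback("report.pdf"))  # ('page text', {'page_count': 1})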
@@ -444,7 +649,7 @@
              pdf_reader = pypdf.PdfReader(f)

              if not pdf_reader.metadata:
-                 logger.warning(f"No metadata found in {file_path}")
+                 logger.warning(t("no_metadata_found_in_file", file_path=file_path))
                  return "", {}

              metadata = {
@@ -454,11 +659,21 @@
              }

              for page_num, page in enumerate(pdf_reader.pages, 1):
+                 # Check if stopped before processing each page
+                 if self.stopped:
+                     return "", {}
+
+                 # Check if paused before processing each page
+                 while self.paused:
+                     self.paused_event.wait(0.1)
+                     if self.stopped:
+                         return "", {}
+
                  text = page.extract_text()
                  text_parts.append(f"[Page {page_num}]\n{text}")

          except Exception as e:
-             logger.warning(f"Error extracting PDF with pypdf: {e}")
+             logger.warning(t("error_extracting_pdf_with_pypdf", error=e))
              return "", {}

          return "\n\n".join(text_parts), metadata
@@ -466,7 +681,7 @@
      def _extract_odt(self, file_path: Path) -> tuple[str, dict[str, Any]]:
          """Extract text from ODT (OpenDocument Text) file."""
          if odf_odt is None:
-             logger.warning("odfpy not installed, skipping ODT extraction")
+             logger.warning(t("odfpy_not_installed_skipping_extraction"))
              return "", {}

          try:
@@ -479,7 +694,7 @@

              return text, metadata
          except Exception as e:
-             logger.warning(f"Error extracting ODT: {e}")
+             logger.warning(t("error_extracting_odt", error=e))
              return "", {}

      def _extract_rtf(self, file_path: Path) -> tuple[str, dict[str, Any]]:
@@ -514,13 +729,13 @@

              return text, metadata
          except Exception as e:
-             logger.warning(f"Error extracting RTF: {e}")
+             logger.warning(t("error_extracting_rtf", error=e))
              return "", {}

      def _extract_epub(self, file_path: Path) -> tuple[str, dict[str, Any]]:
          """Extract text from EPUB (ebook) file."""
          if ebooklib is None:
-             logger.warning("ebooklib not installed, skipping EPUB extraction")
+             logger.warning(t("ebooklib_not_installed_skipping_extraction"))
              return "", {}

          try:
@@ -529,6 +744,16 @@

              # Extract text from all items
              for item in book.get_items():
+                 # Check if stopped before processing each item
+                 if self.stopped:
+                     return "", {}
+
+                 # Check if paused before processing each item
+                 while self.paused:
+                     self.paused_event.wait(0.1)
+                     if self.stopped:
+                         return "", {}
+
                  if item.get_type() == ebooklib.ITEM_DOCUMENT:  # pyright: ignore[reportAttributeAccessIssue]
                      # Remove HTML tags
                      html_content = item.get_content().decode("utf-8")  # pyright: ignore[reportAttributeAccessIssue]
@@ -546,7 +771,7 @@

              return "\n\n".join(text_parts), metadata
          except Exception as e:
-             logger.warning(f"Error extracting EPUB: {e}")
+             logger.warning(t("error_extracting_epub", error=e))
              return "", {}

      def _extract_csv(self, file_path: Path) -> tuple[str, dict[str, Any]]:
@@ -556,6 +781,16 @@
              with open(file_path, encoding="utf-8", errors="ignore") as f:
                  reader = csv.reader(f)
                  for row in reader:
+                     # Check if stopped periodically during row processing
+                     if self.stopped:
+                         return "", {}
+
+                     # Check if paused periodically during row processing
+                     while self.paused:
+                         self.paused_event.wait(0.1)
+                         if self.stopped:
+                             return "", {}
+
                      row_text = " | ".join(str(cell) for cell in row)
                      text_parts.append(row_text)

@@ -565,7 +800,7 @@

              return "\n".join(text_parts), metadata
          except Exception as e:
-             logger.warning(f"Error extracting CSV: {e}")
+             logger.warning(t("error_extracting_csv", error=e))
              return "", {}

      def _extract_xml(self, file_path: Path) -> tuple[str, dict[str, Any]]:
@@ -585,7 +820,7 @@

              return text, metadata
          except Exception as e:
-             logger.warning(f"Error extracting XML: {e}")
+             logger.warning(t("error_extracting_xml", error=e))
              return "", {}

      def _extract_html(self, file_path: Path) -> tuple[str, dict[str, Any]]:
@@ -607,7 +842,7 @@

              return text, metadata
          except Exception as e:
-             logger.warning(f"Error extracting HTML: {e}")
+             logger.warning(t("error_extracting_html", error=e))
              return "", {}

      def _extract_markdown(self, file_path: Path) -> tuple[str, dict[str, Any]]:
@@ -634,23 +869,43 @@

              return text, metadata
          except Exception as e:
-             logger.warning(f"Error extracting Markdown: {e}")
+             logger.warning(t("error_extracting_markdown", error=e))
              return "", {}

      def _extract_docx(self, file_path: Path) -> tuple[str, dict[str, Any]]:
          """Extract text from DOCX file."""
          if Document is None:
-             logger.warning("python-docx not installed, skipping DOCX extraction")
+             logger.warning(t("python_docx_not_installed_skipping_extraction"))
              return "", {}

          doc = Document(str(file_path))
          text_parts = []

          for paragraph in doc.paragraphs:
+             # Check if stopped periodically during paragraph processing
+             if self.stopped:
+                 return "", {}
+
+             # Check if paused periodically during paragraph processing
+             while self.paused:
+                 self.paused_event.wait(0.1)
+                 if self.stopped:
+                     return "", {}
+
              text_parts.append(paragraph.text)

          # Extract tables
          for table in doc.tables:
+             # Check if stopped before processing each table
+             if self.stopped:
+                 return "", {}
+
+             # Check if paused before processing each table
+             while self.paused:
+                 self.paused_event.wait(0.1)
+                 if self.stopped:
+                     return "", {}
+
              for row in table.rows:
                  row_text = " | ".join(cell.text for cell in row.cells)
                  text_parts.append(row_text)
@@ -665,16 +920,40 @@
      def _extract_xlsx(self, file_path: Path) -> tuple[str, dict[str, Any]]:
          """Extract text from XLSX file."""
          if load_workbook is None:
-             logger.warning("openpyxl not installed, skipping XLSX extraction")
+             logger.warning(t("openpyxl_not_installed_skipping_extraction"))
              return "", {}

          wb = load_workbook(file_path, read_only=True, data_only=True)
          text_parts = []

          for sheet_name in wb.sheetnames:
+             # Check if stopped before processing each sheet
+             if self.stopped:
+                 wb.close()
+                 return "", {}
+
+             # Check if paused before processing each sheet
+             while self.paused:
+                 self.paused_event.wait(0.1)
+                 if self.stopped:
+                     wb.close()
+                     return "", {}
+
              sheet = wb[sheet_name]
              text_parts.append(f"[Sheet: {sheet_name}]")
              for row in sheet.iter_rows(values_only=True):
+                 # Check if stopped periodically during row processing
+                 if self.stopped:
+                     wb.close()
+                     return "", {}
+
+                 # Check if paused periodically during row processing
+                 while self.paused:
+                     self.paused_event.wait(0.1)
+                     if self.stopped:
+                         wb.close()
+                         return "", {}
+
                  row_text = " | ".join(str(cell) if cell is not None else "" for cell in row)
                  if row_text.strip():
                      text_parts.append(row_text)
@@ -684,6 +963,7 @@
              "sheets": wb.sheetnames,
          }

+         wb.close()
          return "\n".join(text_parts), metadata

      def _extract_pptx(self, file_path: Path) -> tuple[str, dict[str, Any]]:
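The added `wb.close()` calls matter because `read_only=True` workbooks keep the underlying file handle open until closed explicitly. One gap remains: an exception raised between `load_workbook` and `wb.close()` would still leak the handle. A hedged alternative, not what this diff does, is to let `contextlib.closing` guarantee the close on every exit path:

    import contextlib
    from openpyxl import load_workbook

    def iter_xlsx_rows(path):
        # closing() calls wb.close() on success, early return, or exception
        with contextlib.closing(load_workbook(path, read_only=True, data_only=True)) as wb:
            for sheet_name in wb.sheetnames:
                for row in wb[sheet_name].iter_rows(values_only=True):
                    yield sheet_name, row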
@@ -691,13 +971,23 @@
          try:
              from pptx import Presentation
          except ImportError:
-             logger.warning("python-pptx not installed, skipping PPTX extraction")
+             logger.warning(t("python_pptx_not_installed_skipping_extraction"))
              return "", {}

          prs = Presentation(str(file_path))
          text_parts = []

          for slide_num, slide in enumerate(prs.slides, 1):
+             # Check if stopped before processing each slide
+             if self.stopped:
+                 return "", {}
+
+             # Check if paused before processing each slide
+             while self.paused:
+                 self.paused_event.wait(0.1)
+                 if self.stopped:
+                     return "", {}
+
              text_parts.append(f"[Slide {slide_num}]")
              for shape in slide.shapes:
                  if hasattr(shape, "text"):
@@ -712,7 +1002,7 @@
      def _extract_image(self, file_path: Path) -> tuple[str, dict[str, Any]]:
          """Extract text from image file using OCR."""
          if Image is None or pytesseract is None:
-             logger.warning("PIL or pytesseract not installed, skipping image OCR")
+             logger.warning(t("pillow_or_tesseract_not_installed_skipping_ocr"))
              return "", {}

          try:
@@ -727,7 +1017,7 @@

              return text, metadata
          except Exception as e:
-             logger.warning(f"Could not perform OCR on {file_path}: {e}")
+             logger.warning(t("could_not_perform_ocr_on_file", file_path=file_path, error=e))
              return "", {}

      def _extract_text(self, file_path: Path) -> tuple[str, dict[str, Any]]:
@@ -747,36 +1037,50 @@

  def main():
      """Main entry point for document scanner."""
-     parser = argparse.ArgumentParser(
-         description="Scan documents and extract text, images, and metadata with high performance"
-     )
-     parser.add_argument("input", type=str, nargs="?", default=str(cwd), help="Input directory")
-     parser.add_argument("-r", "--rules", type=str, default="rules.json", help="Rules file (JSON)")
-     parser.add_argument("--recursive", action="store_true", help="Scan files recursively")
+     # Parse the language argument first, without using translations yet
+     temp_parser = argparse.ArgumentParser(add_help=False)
+     temp_parser.add_argument("--lang", choices=["en", "zh"], default="zh")
+     temp_args, _ = temp_parser.parse_known_args()
+
+     # Set the language
+     global USE_CHINESE
+     USE_CHINESE = temp_args.lang == "zh"
+
+     parser = argparse.ArgumentParser(description=t("document_scanner_description"))
+     parser.add_argument("input", type=str, nargs="?", default=str(cwd), help=t("input_directory_help"))
+     parser.add_argument("-r", "--rules", type=str, default="rules.json", help=t("rules_file_help"))
+     parser.add_argument("--recursive", action="store_true", help=t("recursive_help"))
      parser.add_argument(
          "-f",
          "--file-types",
-         help="File types to scan (comma-separated)",
-         default="pdf,docx,xlsx,pptx,txt,odt,rtf,epub,csv,xml,html,md",
+         help=t("file_types_help"),
+         default="pdf,docx,xlsx,pptx,txt,odt,rtf,epub,csv,xml,html,md,jpg,jpeg,png,gif,bmp,tiff",
      )
-     parser.add_argument("--use-pdf-ocr", help="Use PDF OCR for image-based PDFs", action="store_true")
+     parser.add_argument("--use-pdf-ocr", help=t("use_pdf_ocr_help"), action="store_true")
      parser.add_argument(
          "--use-process-pool",
-         help="Use process pool instead of thread pool (better for CPU-intensive tasks)",
+         help=t("use_process_pool_help"),
          action="store_true",
      )
      parser.add_argument(
          "-b",
          "--batch-size",
-         help="Number of files to process in each batch",
+         help=t("batch_size_help"),
          default=50,
          type=int,
      )
-     parser.add_argument("-t", "--threads", help="Number of threads for parallel scanning", default=4, type=int)
-     parser.add_argument("--progress", help="Show progress bar", action="store_true")
-     parser.add_argument("-v", "--verbose", help="Verbose output", action="store_true")
+     parser.add_argument("-t", "--threads", help=t("threads_help"), default=4, type=int)
+     parser.add_argument("--progress", help=t("progress_help"), action="store_true")
+     parser.add_argument("-v", "--verbose", help=t("verbose_help"), action="store_true")
+
+     # Add the language argument
+     parser.add_argument("--lang", help=t("language_help"), choices=["en", "zh"], default="zh")
+
      args = parser.parse_args()

+     # Re-confirm the language setting (in case the user changed it in the full argument list)
+     USE_CHINESE = args.lang == "zh"
+
      if args.verbose:
          logger.setLevel(logging.DEBUG)

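The two-parser dance in `main()` exists because the help strings themselves go through `t()`, yet `t()` needs to know the language before the real parser is built: a throwaway parser with `add_help=False` and `parse_known_args()` peeks at `--lang` without tripping over arguments it does not define. A condensed sketch of the same bootstrap pattern:

    import argparse

    # Pass 1: peek at --lang only; unknown arguments are ignored and -h is deferred
    pre = argparse.ArgumentParser(add_help=False)
    pre.add_argument("--lang", choices=["en", "zh"], default="zh")
    lang = pre.parse_known_args()[0].lang

    # Pass 2: the real parser, whose help text can now be localized
    messages = {"zh": "输入目录", "en": "Input directory"}
    parser = argparse.ArgumentParser()
    parser.add_argument("input", nargs="?", help=messages[lang])
    parser.add_argument("--lang", choices=["en", "zh"], default="zh")
    args = parser.parse_args()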
@@ -784,9 +1088,9 @@
      # Validate input directory
      input_dir = Path(args.input)
      if not input_dir.exists() or not input_dir.is_dir():
-         logger.error(f"Input directory does not exist: {args.input}")
+         logger.error(t("input_directory_does_not_exist", input_dir=args.input))
          return
-     logger.info(f"Scanning directory: {input_dir}...")
+     logger.info(t("scanning_directory", directory=str(input_dir)))

      # Load rules file
      rules_file = Path(args.rules)
@@ -796,15 +1100,15 @@
          if rule_files_in_input_dir:
              rules_file = rule_files_in_input_dir[0]
          else:
-             logger.error(f"Rules file does not exist: {args.rules}")
+             logger.error(t("rules_file_does_not_exist_alt", rules_file=args.rules))
              return
-     logger.info(f"Using rules file: {rules_file}")
+     logger.info(t("using_rules_file", rules_file=str(rules_file)))

      try:
          with open(rules_file, encoding="utf-8") as f:
              rules_data = json.load(f)
      except json.JSONDecodeError as e:
-         logger.error(f"Invalid JSON in rules file: {e}")
+         logger.error(t("invalid_json_in_rules_file", error=e))
          return

      # Parse rules
@@ -814,11 +1118,11 @@
      elif isinstance(rules_data, dict) and "rules" in rules_data:
          rules = [Rule(rule) for rule in rules_data["rules"]]
      else:
-         logger.error("Invalid rules format. Expected a list or dict with 'rules' key")
+         logger.error(t("invalid_rules_format"))
          return

      if not rules:
-         logger.error("No valid rules found")
+         logger.error(t("no_valid_rules_found"))
          return

      # Parse file types
@@ -833,8 +1137,8 @@
      with open(output_file, "w", encoding="utf-8") as f:
          json.dump(results, f, indent=2, ensure_ascii=False)

-     logger.info(f"Results saved to: {output_file}")
-     logger.info(f"Total time elapsed: {time.perf_counter() - t0:.2f}s")
+     logger.info(t("results_saved_to", path=str(output_file)))
+     logger.info(t("total_time_elapsed", time=round(time.perf_counter() - t0, 2)))


  if __name__ == "__main__":