pysfi 0.1.6__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sfi/docscan/docscan.py ADDED
@@ -0,0 +1,1145 @@
+ """Scan documents and extract text, images, and metadata based on configurable rules."""
+
+ from __future__ import annotations
+
+ import argparse
+ import contextlib
+ import csv
+ import html
+ import json
+ import logging
+ import re
+ import sys
+ import threading
+ import time
+ import xml.etree.ElementTree as ET
+ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
+ from concurrent.futures import TimeoutError as FuturesTimeoutError
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Callable
+
+ try:
+     import fitz  # PyMuPDF
+ except ImportError:
+     fitz = None
+
+ try:
+     from docx import Document
+ except ImportError:
+     Document = None
+
+ try:
+     from openpyxl import load_workbook
+ except ImportError:
+     load_workbook = None
+
+ try:
+     from PIL import Image
+ except ImportError:
+     Image = None
+
+ try:
+     import pytesseract
+ except ImportError:
+     pytesseract = None
+
+ try:
+     import odf.opendocument as odf_odt  # ODT support
+ except ImportError:
+     odf_odt = None
+
+ try:
+     import ebooklib  # EPUB support
+     from ebooklib import epub
+ except ImportError:
+     ebooklib = None
+
+ try:
+     import markdown  # Markdown to text
+ except ImportError:
+     markdown = None
+
+ try:
+     import pypdf  # Alternative PDF library
+ except ImportError:
+     pypdf = None
+
+ # Language support imports
+ try:
+     from sfi.docscan.lang.eng import ENGLISH_DEFAULTS as EN_TRANSLATIONS
+     from sfi.docscan.lang.zhcn import TRANSLATIONS as ZH_TRANSLATIONS
+ except ImportError:
+     try:
+         from lang.eng import ENGLISH_DEFAULTS as EN_TRANSLATIONS
+         from lang.zhcn import TRANSLATIONS as ZH_TRANSLATIONS
+     except ImportError:
+         # Fallback translations if import fails
+         ZH_TRANSLATIONS = {}
+         EN_TRANSLATIONS = {}
+
+ # Global language setting
+ USE_CHINESE = True  # Default to Chinese
+
+
+ def t(key: str, **kwargs) -> str:
+     """Get translated text for the given key.
+
+     Args:
+         key: Translation key
+         **kwargs: Arguments for string formatting
+
+     Returns:
+         Translated text
+     """
+     text = ZH_TRANSLATIONS.get(key, key) if USE_CHINESE else EN_TRANSLATIONS.get(key, key)
+
+     # Format with kwargs if provided
+     if kwargs:
+         with contextlib.suppress(KeyError, ValueError):
+             text = text.format(**kwargs)
+     return text
+
+
+ logging.basicConfig(level=logging.INFO, format="%(message)s")
+ logger = logging.getLogger(__name__)
+ cwd = Path.cwd()
+
+
+ class Rule:
+     """Represents a scanning rule with optimized pattern matching."""
+
+     def __init__(self, rule_data: dict[str, Any]):
+         """Initialize rule from dictionary."""
+         self.name = rule_data.get("name", "")
+         self.pattern = rule_data.get("pattern", "")
+         self.is_regex = rule_data.get("regex", False)
+         self.case_sensitive = rule_data.get("case_sensitive", False)
+         self.context_lines = rule_data.get("context_lines", 3)
+         self.description = rule_data.get("description", "")
+
+         if self.is_regex:
+             flags = 0 if self.case_sensitive else re.IGNORECASE
+             try:
+                 # re.ASCII speeds up matching but restricts \w, \d, and \s to ASCII characters
+                 self.compiled_pattern = re.compile(self.pattern, flags | re.ASCII)
+             except re.error as e:
+                 logger.warning(t("invalid_regex_pattern", pattern=self.pattern, error=e))
+                 self.compiled_pattern = None
+         else:
+             self.compiled_pattern = None
+
+     def search(self, text: str) -> list[dict[str, Any]]:
+         """Search for pattern in text and return matches."""
+         if not text or not self.pattern:
+             return []
+
+         matches = []
+         lines = text.split("\n")
+
+         if self.is_regex and self.compiled_pattern:
+             # Regex search
+             for line_num, line in enumerate(lines, 1):
+                 for match in self.compiled_pattern.finditer(line):
+                     matches.append({
+                         "type": "regex",
+                         "line_number": line_num,
+                         "match": match.group(),
+                         "start": match.start(),
+                         "end": match.end(),
+                         "context": self._get_context(lines, line_num - 1),
+                     })
+         else:
+             # Simple text search
+             search_text = self.pattern if self.case_sensitive else self.pattern.lower()
+             for line_num, line in enumerate(lines, 1):
+                 compare_line = line if self.case_sensitive else line.lower()
+                 start = 0
+                 while True:
+                     pos = compare_line.find(search_text, start)
+                     if pos == -1:
+                         break
+                     matches.append({
+                         "type": "text",
+                         "line_number": line_num,
+                         "match": line[pos : pos + len(self.pattern)],
+                         "start": pos,
+                         "end": pos + len(self.pattern),
+                         "context": self._get_context(lines, line_num - 1),
+                     })
+                     start = pos + 1
+
+         return matches
+
+     def _get_context(self, lines: list[str], line_index: int) -> list[str]:
+         """Get context lines around a match."""
+         start = max(0, line_index - self.context_lines)
+         end = min(len(lines), line_index + self.context_lines + 1)
+         return lines[start:end]
+
+
+ class DocumentScanner:
+     """High-performance document scanner with multi-format support."""
+
+     def __init__(
+         self,
+         input_dir: Path,
+         rules: list[Rule],
+         file_types: list[str],
+         use_pdf_ocr: bool = False,
+         use_process_pool: bool = False,
+         batch_size: int = 50,
+     ):
+         """Initialize scanner with input directory and rules.
+
+         Args:
+             input_dir: Directory containing documents to scan
+             rules: List of scanning rules
+             file_types: List of file extensions to scan
+             use_pdf_ocr: Use OCR for PDF files
+             use_process_pool: Use a process pool instead of a thread pool for CPU-intensive tasks
+             batch_size: Number of files to process in each batch
+         """
+         self.input_dir = Path(input_dir)
+         self.rules = rules
+         self.file_types = file_types
+         self.use_pdf_ocr = use_pdf_ocr
+         self.use_process_pool = use_process_pool
+         self.batch_size = batch_size
+         self.results = []
+         self.paused = False
+         self.paused_event = threading.Event()
+         self.paused_event.set()  # Initially not paused
+         self.stopped = False
+         self._progress_callback = None
+         self._executor = None  # Keep reference to executor for forced shutdown
+
+     def set_progress_callback(self, callback: Callable[[int, int], None]) -> None:
+         """Set callback function for progress updates.
+
+         Args:
+             callback: Function to call with progress (current, total)
+         """
+         self._progress_callback = callback
+
+     def pause(self) -> None:
+         """Pause the scanning process."""
+         self.paused = True
+         self.paused_event.clear()
+
+     def resume(self) -> None:
+         """Resume the scanning process."""
+         self.paused = False
+         self.paused_event.set()
+         logger.info(t("scan_resumed"))
+
+     def stop(self) -> None:
+         """Stop the scanning process."""
+         self.stopped = True
+         self.paused_event.set()  # Ensure waiting threads can exit
+         logger.info(t("scan_stopped"))
+
+     def is_paused(self) -> bool:
+         """Check if the scanner is paused."""
+         return self.paused
+
+     def is_stopped(self) -> bool:
+         """Check if the scanner is stopped."""
+         return self.stopped
+
+     def scan(self, threads: int = 4, show_progress: bool = False) -> dict[str, Any]:
+         """Scan all documents in input directory.
+
+         Args:
+             threads: Number of worker threads/processes
+             show_progress: Show progress bar
+
+         Returns:
+             Dictionary containing scan results
+         """
+         self.stopped = False
+         self.paused = False
+         self.paused_event.set()
+
+         logger.info(t("scanning_directory", directory=str(self.input_dir)))
+         files = self._collect_files()
+         logger.info(t("found_files_to_scan", count=len(files)))
+
+         results = {
+             "scan_info": {
+                 "input_directory": str(self.input_dir),
+                 "scan_time": datetime.now().isoformat(),
+                 "file_types_scanned": self.file_types,
+                 "total_files": len(files),
+                 "rules_count": len(self.rules),
+                 "use_pdf_ocr": self.use_pdf_ocr,
+                 "use_process_pool": self.use_process_pool,
+             },
+             "rules": [{"name": r.name, "pattern": r.pattern, "is_regex": r.is_regex} for r in self.rules],
+             "matches": [],
+         }
+
+         # Scan files in parallel
+         processed = 0
+         executor_class = ProcessPoolExecutor if self.use_process_pool else ThreadPoolExecutor
+         executor = executor_class(max_workers=threads)
+         self._executor = executor  # Keep reference for forced shutdown
+
+         try:
+             # Submit futures one by one to respect pause state
+             submitted_futures = []
+             was_paused = False  # Track previous pause state
+
+             for file in files:
+                 # Check if stopped before submitting more tasks
+                 if self.stopped:
+                     logger.info(t("scan_stopped_before_submitting_tasks"))
+                     break
+
+                 # Wait if paused before submitting new tasks
+                 while self.paused:
+                     # Log when entering paused state
+                     if not was_paused:
+                         logger.info(t("scan_paused"))
+                         was_paused = True
+
+                     self.paused_event.wait(0.1)
+                     if self.stopped:
+                         logger.info(t("scan_stopped_while_paused"))
+                         break
+
+                 # Log when exiting paused state
+                 if was_paused and not self.paused:
+                     logger.info(t("scan_resumed"))
+                     was_paused = False
+
+                 if self.stopped:
+                     break
+
+                 future = executor.submit(self._scan_file_with_pause_check, file)
+                 submitted_futures.append(future)
+
+             # Process completed futures
+             for future in as_completed(submitted_futures):
+                 # Check if stopped before processing this future
+                 if self.stopped:
+                     logger.info(t("scan_stopped_by_user_canceling_tasks"))
+                     # Cancel all remaining futures
+                     for f in submitted_futures:
+                         if not f.done():
+                             f.cancel()
+                     break
+
+                 # Wait if paused before processing result
+                 while self.paused:
+                     # Log when entering paused state
+                     if not was_paused:
+                         logger.info(t("scan_paused"))
+                         was_paused = True
+
+                     self.paused_event.wait(0.1)
+                     if self.stopped:
+                         logger.info(t("scan_stopped_while_paused"))
+                         break
+
+                 # Log when exiting paused state
+                 if was_paused and not self.paused:
+                     logger.info(t("scan_resumed"))
+                     was_paused = False
+
+                 if self.stopped:
+                     break
+
+                 try:
+                     file_result = future.result(timeout=1.0)  # Short timeout to allow quick stop
+                     if file_result and file_result["matches"]:
+                         results["matches"].append(file_result)
+                         logger.info(t("found_matches_in_file", file_name=Path(file_result.get("file_path", "")).name))
+                 except FuturesTimeoutError:
+                     logger.warning(t("task_timeout_scan_may_be_stopping"))
+                     if self.stopped:
+                         break
+                 except Exception as e:
+                     if not self.stopped:
+                         logger.error(t("error_scanning_file", error=e))
+
+                 processed += 1
+
+                 # Report progress
+                 if show_progress and processed % 10 == 0:
+                     logger.info(t("progress_report", processed=processed, total=len(files)))
+
+                 # Call progress callback if set
+                 if self._progress_callback:
+                     self._progress_callback(processed, len(files))
+
+         finally:
+             # Force shutdown if stopped
+             if self.stopped and self._executor:
+                 logger.info(t("force_shutting_down_executor"))
+                 if sys.version_info >= (3, 9):
+                     self._executor.shutdown(wait=False, cancel_futures=True)
+                 else:
+                     self._executor.shutdown(wait=False)
+             else:
+                 self._executor.shutdown(wait=True)
+             self._executor = None
+
+         results["scan_info"]["files_with_matches"] = len(results["matches"])
+         results["scan_info"]["files_processed"] = processed
+         results["stopped"] = self.stopped
+
+         if self.stopped:
+             logger.info(t("scan_stopped_processed_files", processed=processed))
+         else:
+             logger.info(t("scan_complete_found_matches", matches_count=len(results["matches"])))
+
+         return results
+
+     def _scan_file_with_pause_check(self, file_path: Path) -> dict[str, Any]:
+         """Scan a single file with pause check."""
+         # Check if stopped before processing
+         if self.stopped:
+             return {}
+
+         # Check if paused before processing
+         while self.paused:
+             self.paused_event.wait(0.1)
+             if self.stopped:
+                 return {}
+
+         return self._scan_file(file_path)
+
+     def _collect_files(self) -> list[Path]:
+         """Collect all files matching the specified types."""
+         files = []
+         image_extensions = ["jpg", "jpeg", "png", "gif", "bmp", "tiff"]
+
+         for ext in self.file_types:
+             # If extension is an image format and OCR is not enabled, skip
+             if ext.lower() in image_extensions and not self.use_pdf_ocr:
+                 continue
+             files.extend(self.input_dir.rglob(f"*.{ext.lower()}"))
+             files.extend(self.input_dir.rglob(f"*.{ext.upper()}"))
+         return list(set(files))  # Remove duplicates
+
+     def _scan_file(self, file_path: Path) -> dict[str, Any]:
+         """Scan a single file and return matches."""
+         # Check if stopped before starting
+         if self.stopped:
+             return {}
+
+         # Check if paused before starting
+         while self.paused:
+             self.paused_event.wait(0.1)
+             if self.stopped:
+                 return {}
+
+         file_start_time = time.perf_counter()
+         ext = file_path.suffix.lower().lstrip(".")
+         text = ""
+         metadata = {}
+
+         try:
+             # Check if stopped before extraction
+             if self.stopped:
+                 return {}
+
+             # Check if paused before extraction
+             while self.paused:
+                 self.paused_event.wait(0.1)
+                 if self.stopped:
+                     return {}
+
+             # Route to appropriate extractor
+             if ext == "pdf":
+                 text, metadata = self._extract_pdf(file_path)
+             elif ext == "odt":
+                 text, metadata = self._extract_odt(file_path)
+             elif ext == "rtf":
+                 text, metadata = self._extract_rtf(file_path)
+             elif ext == "epub":
+                 text, metadata = self._extract_epub(file_path)
+             elif ext == "csv":
+                 text, metadata = self._extract_csv(file_path)
+             elif ext == "xml":
+                 text, metadata = self._extract_xml(file_path)
+             elif ext in ("html", "htm"):
+                 text, metadata = self._extract_html(file_path)
+             elif ext == "md":
+                 text, metadata = self._extract_markdown(file_path)
+             elif ext in ["docx", "doc"]:
+                 text, metadata = self._extract_docx(file_path)
+             elif ext in ["xlsx", "xls"]:
+                 text, metadata = self._extract_xlsx(file_path)
+             elif ext in ["pptx", "ppt"]:
+                 text, metadata = self._extract_pptx(file_path)
+             elif ext in ["jpg", "jpeg", "png", "gif", "bmp", "tiff"]:
+                 # Only extract images if OCR is enabled
+                 if self.use_pdf_ocr:  # Using the same flag for consistency
+                     text, metadata = self._extract_image(file_path)
+                 else:
+                     return {}  # Skip image files if OCR is disabled
+             else:
+                 text, metadata = self._extract_text(file_path)
+
+             # Check if stopped after extraction
+             if self.stopped:
+                 return {}
+
+             # Check if paused after extraction
+             while self.paused:
+                 self.paused_event.wait(0.1)
+                 if self.stopped:
+                     return {}
+
+         except Exception as e:
+             logger.warning(t("could_not_extract_text_from_file", file_path=file_path, error=e))
+             return {}
+
+         processing_time = time.perf_counter() - file_start_time
+
+         if not text:
+             return {}
+
+         # Apply all rules with stop check
+         file_matches = []
+         for rule in self.rules:
+             if self.stopped:
+                 return {}
+             # Check if paused before each rule
+             while self.paused:
+                 self.paused_event.wait(0.1)
+                 if self.stopped:
+                     return {}
+             rule_matches = rule.search(text)
+             if rule_matches:
+                 for match in rule_matches:
+                     match["rule_name"] = rule.name
+                     match["rule_description"] = rule.description
+                 file_matches.extend(rule_matches)
+
+         if not file_matches:
+             return {}
+
+         # Add processing time to metadata
+         metadata["processing_time_seconds"] = round(processing_time, 3)
+
+         logger.info(
+             t(
+                 "processed_file_info",
+                 file_name=file_path.name,
+                 ext=ext,
+                 time=round(processing_time, 3),
+                 matches_count=len(file_matches),
+             )
+         )
+
+         return {
+             "file_path": str(file_path),
+             "file_type": ext,
+             "file_size": file_path.stat().st_size,
+             "metadata": metadata,
+             "matches": file_matches,
+         }
+
+     def _extract_pdf(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from PDF file with fallback."""
+         # Try PyMuPDF first (faster)
+         if fitz is not None:
+             try:
+                 return self._extract_pdf_fitz(file_path)
+             except Exception as e:
+                 logger.warning(t("pymupdf_failed_for_file", file_name=file_path.name, error=e))
+
+         # Fallback to pypdf
+         if pypdf is not None:
+             try:
+                 return self._extract_pdf_pypdf(file_path)
+             except Exception as e:
+                 logger.error(t("pypdf_also_failed_for_file", file_name=file_path.name, error=e))
+                 return "", {}
+
+         logger.warning(t("no_pdf_library_installed"))
+         return "", {}
+
+     def _extract_pdf_fitz(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from PDF using PyMuPDF (fastest method)."""
+         if not fitz:
+             logger.warning(t("pymupdf_not_installed"))
+             return "", {}
+
+         doc = None
+         try:
+             doc = fitz.open(str(file_path))
+             if doc.page_count == 0:
+                 logger.warning(t("no_pages_found_in_file", file_path=file_path))
+                 return "", {}
+             if not doc.metadata:
+                 logger.warning(t("no_metadata_found_in_file", file_path=file_path))
+                 return "", {}
+
+             text_parts = []
+             metadata = {
+                 "page_count": doc.page_count,
+                 "title": doc.metadata.get("title", ""),
+                 "author": doc.metadata.get("author", ""),
+                 "subject": doc.metadata.get("subject", ""),
+                 "creator": doc.metadata.get("creator", ""),
+             }
+
+             if self.use_pdf_ocr and pytesseract and Image:
+                 # OCR for image-based PDFs
+                 import io
+
+                 for page_num, page in enumerate(doc, 1):  # pyright: ignore[reportArgumentType]
+                     # Check if stopped before processing each page
+                     if self.stopped:
+                         doc.close()
+                         return "", {}
+
+                     # Check if paused before processing each page
+                     while self.paused:
+                         self.paused_event.wait(0.1)
+                         if self.stopped:
+                             doc.close()
+                             return "", {}
+
+                     pix = page.get_pixmap()
+                     img_data = pix.tobytes("png")
+                     image = Image.open(io.BytesIO(img_data))
+                     text = pytesseract.image_to_string(image)
+                     text_parts.append(f"[Page {page_num}]\n{text}")
+             else:
+                 # Extract text directly (faster)
+                 for page_num, page in enumerate(doc, 1):  # pyright: ignore[reportArgumentType]
+                     # Check if stopped before processing each page
+                     if self.stopped:
+                         doc.close()
+                         return "", {}
+
+                     # Check if paused before processing each page
+                     while self.paused:
+                         self.paused_event.wait(0.1)
+                         if self.stopped:
+                             doc.close()
+                             return "", {}
+
+                     text = page.get_text()
+                     text_parts.append(f"[Page {page_num}]\n{text}")
+
+             doc.close()
+             return "\n\n".join(text_parts), metadata
+         except Exception as e:
+             if doc:
+                 doc.close()
+             logger.warning(t("pymupdf_error_trying_fallback", file_path=file_path, error=e))
+             # Re-raise to trigger fallback to pypdf
+             raise
+
+     def _extract_pdf_pypdf(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from PDF using pypdf (fallback method)."""
+         if not pypdf:
+             logger.warning(t("pypdf_not_installed_skipping_extraction"))
+             return "", {}
+
+         text_parts = []
+         metadata = {}
+         try:
+             with open(file_path, "rb") as f:
+                 pdf_reader = pypdf.PdfReader(f)
+
+                 if not pdf_reader.metadata:
+                     logger.warning(t("no_metadata_found_in_file", file_path=file_path))
+                     return "", {}
+
+                 metadata = {
+                     "page_count": len(pdf_reader.pages),
+                     "title": pdf_reader.metadata.get("/Title", ""),
+                     "author": pdf_reader.metadata.get("/Author", ""),
+                 }
+
+                 for page_num, page in enumerate(pdf_reader.pages, 1):
+                     # Check if stopped before processing each page
+                     if self.stopped:
+                         return "", {}
+
+                     # Check if paused before processing each page
+                     while self.paused:
+                         self.paused_event.wait(0.1)
+                         if self.stopped:
+                             return "", {}
+
+                     text = page.extract_text()
+                     text_parts.append(f"[Page {page_num}]\n{text}")
+
+         except Exception as e:
+             logger.warning(t("error_extracting_pdf_with_pypdf", error=e))
+             return "", {}
+
+         return "\n\n".join(text_parts), metadata
+
+     def _extract_odt(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from ODT (OpenDocument Text) file."""
+         if odf_odt is None:
+             logger.warning(t("odfpy_not_installed_skipping_extraction"))
+             return "", {}
+
+         try:
+             doc = odf_odt.load(file_path)
+             text = doc.textual_content  # pyright: ignore[reportAttributeAccessIssue]
+
+             metadata = {
+                 "format": "ODT",
+             }
+
+             return text, metadata
+         except Exception as e:
+             logger.warning(t("error_extracting_odt", error=e))
+             return "", {}
+
+     def _extract_rtf(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from RTF (Rich Text Format) file."""
+         try:
+             with open(file_path, "rb") as f:
+                 content = f.read()
+
+             # Simple RTF text extraction (removes control words)
+             text = ""
+             i = 0
+             while i < len(content):
+                 if content[i] == ord("\\") and i + 1 < len(content):
+                     if content[i + 1] in [ord("'"), ord("*"), ord("\\")]:
+                         i += 2
+                         continue
+                     # Skip control words
+                     while (
+                         i < len(content)
+                         and content[i] != ord(" ")
+                         and content[i] != ord("{")
+                         and content[i] != ord("}")
+                     ):
+                         i += 1
+                 elif 32 <= content[i] <= 126:  # Printable ASCII
+                     text += chr(content[i])
+                     i += 1
+                 else:
+                     # Skip non-printable bytes (e.g., newlines); without this the loop never advances
+                     i += 1
+
+             metadata = {
+                 "format": "RTF",
+             }
+
+             return text, metadata
+         except Exception as e:
+             logger.warning(t("error_extracting_rtf", error=e))
+             return "", {}
+
+     def _extract_epub(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from EPUB (ebook) file."""
+         if ebooklib is None:
+             logger.warning(t("ebooklib_not_installed_skipping_extraction"))
+             return "", {}
+
+         try:
+             book = epub.read_epub(file_path)
+             text_parts = []
+
+             # Extract text from all items
+             for item in book.get_items():
+                 # Check if stopped before processing each item
+                 if self.stopped:
+                     return "", {}
+
+                 # Check if paused before processing each item
+                 while self.paused:
+                     self.paused_event.wait(0.1)
+                     if self.stopped:
+                         return "", {}
+
+                 if item.get_type() == ebooklib.ITEM_DOCUMENT:  # pyright: ignore[reportAttributeAccessIssue]
+                     # Remove HTML tags
+                     html_content = item.get_content().decode("utf-8")  # pyright: ignore[reportAttributeAccessIssue]
+                     text = re.sub(r"<[^>]+>", " ", html_content)
+                     text = html.unescape(text)
+                     text_parts.append(text)
+
+             metadata = {
+                 "title": book.get_metadata("DC", "title")[0][0] if book.get_metadata("DC", "title") else "",  # pyright: ignore[reportAttributeAccessIssue]
+                 "author": book.get_metadata("DC", "creator")[0][0] if book.get_metadata("DC", "creator") else "",  # pyright: ignore[reportAttributeAccessIssue]
+                 "format": "EPUB",
+             }
+
+             return "\n\n".join(text_parts), metadata
+         except Exception as e:
+             logger.warning(t("error_extracting_epub", error=e))
+             return "", {}
+
+     def _extract_csv(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from CSV file."""
+         try:
+             text_parts = []
+             with open(file_path, encoding="utf-8", errors="ignore") as f:
+                 reader = csv.reader(f)
+                 for row in reader:
+                     # Check if stopped periodically during row processing
+                     if self.stopped:
+                         return "", {}
+
+                     # Check if paused periodically during row processing
+                     while self.paused:
+                         self.paused_event.wait(0.1)
+                         if self.stopped:
+                             return "", {}
+
+                     row_text = " | ".join(str(cell) for cell in row)
+                     text_parts.append(row_text)
+
+             metadata = {
+                 "format": "CSV",
+             }
+
+             return "\n".join(text_parts), metadata
+         except Exception as e:
+             logger.warning(t("error_extracting_csv", error=e))
+             return "", {}
+
+     def _extract_xml(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from XML file."""
+         try:
+             tree = ET.parse(file_path)
+             root = tree.getroot()
+
+             # Extract all text content
+             text_parts = [elem.text for elem in root.iter() if elem.text and elem.text.strip()]
+             text = "\n".join(text_parts)
+
+             metadata = {
+                 "format": "XML",
+                 "root_tag": root.tag,
+             }
+
+             return text, metadata
+         except Exception as e:
+             logger.warning(t("error_extracting_xml", error=e))
+             return "", {}
+
+     def _extract_html(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from HTML file."""
+         try:
+             with open(file_path, encoding="utf-8", errors="ignore") as f:
+                 html_content = f.read()
+
+             # Remove HTML tags
+             text = re.sub(r"<[^>]+>", " ", html_content)
+             text = html.unescape(text)
+             text = re.sub(r"\s+", " ", text).strip()
+
+             metadata = {
+                 "format": "HTML",
+             }
+
+             return text, metadata
+         except Exception as e:
+             logger.warning(t("error_extracting_html", error=e))
+             return "", {}
+
+     def _extract_markdown(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from Markdown file."""
+         try:
+             with open(file_path, encoding="utf-8", errors="ignore") as f:
+                 content = f.read()
+
+             if markdown:
+                 # Convert Markdown to HTML, then strip the tags
+                 html_content = markdown.markdown(content)  # pyright: ignore[reportAttributeAccessIssue]
+                 text = re.sub(r"<[^>]+>", " ", html_content)
+                 text = html.unescape(text)
+                 text = re.sub(r"\s+", " ", text).strip()
+             else:
+                 # Fall back to treating the Markdown source as plain text
+                 text = content
+
+             metadata = {
+                 "format": "Markdown",
+             }
+
+             return text, metadata
+         except Exception as e:
+             logger.warning(t("error_extracting_markdown", error=e))
+             return "", {}
+
+     def _extract_docx(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from DOCX file."""
+         if Document is None:
+             logger.warning(t("python_docx_not_installed_skipping_extraction"))
+             return "", {}
+
+         doc = Document(str(file_path))
+         text_parts = []
+
+         for paragraph in doc.paragraphs:
+             # Check if stopped periodically during paragraph processing
+             if self.stopped:
+                 return "", {}
+
+             # Check if paused periodically during paragraph processing
+             while self.paused:
+                 self.paused_event.wait(0.1)
+                 if self.stopped:
+                     return "", {}
+
+             text_parts.append(paragraph.text)
+
+         # Extract tables
+         for table in doc.tables:
+             # Check if stopped before processing each table
+             if self.stopped:
+                 return "", {}
+
+             # Check if paused before processing each table
+             while self.paused:
+                 self.paused_event.wait(0.1)
+                 if self.stopped:
+                     return "", {}
+
+             for row in table.rows:
+                 row_text = " | ".join(cell.text for cell in row.cells)
+                 text_parts.append(row_text)
+
+         metadata = {
+             "paragraph_count": len(doc.paragraphs),
+             "table_count": len(doc.tables),
+         }
+
+         return "\n".join(text_parts), metadata
+
+     def _extract_xlsx(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from XLSX file."""
+         if load_workbook is None:
+             logger.warning(t("openpyxl_not_installed_skipping_extraction"))
+             return "", {}
+
+         wb = load_workbook(file_path, read_only=True, data_only=True)
+         text_parts = []
+
+         for sheet_name in wb.sheetnames:
+             # Check if stopped before processing each sheet
+             if self.stopped:
+                 wb.close()
+                 return "", {}
+
+             # Check if paused before processing each sheet
+             while self.paused:
+                 self.paused_event.wait(0.1)
+                 if self.stopped:
+                     wb.close()
+                     return "", {}
+
+             sheet = wb[sheet_name]
+             text_parts.append(f"[Sheet: {sheet_name}]")
+             for row in sheet.iter_rows(values_only=True):
+                 # Check if stopped periodically during row processing
+                 if self.stopped:
+                     wb.close()
+                     return "", {}
+
+                 # Check if paused periodically during row processing
+                 while self.paused:
+                     self.paused_event.wait(0.1)
+                     if self.stopped:
+                         wb.close()
+                         return "", {}
+
+                 row_text = " | ".join(str(cell) if cell is not None else "" for cell in row)
+                 if row_text.strip():
+                     text_parts.append(row_text)
+
+         metadata = {
+             "sheet_count": len(wb.sheetnames),
+             "sheets": wb.sheetnames,
+         }
+
+         wb.close()
+         return "\n".join(text_parts), metadata
+
+     def _extract_pptx(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from PPTX file."""
+         try:
+             from pptx import Presentation
+         except ImportError:
+             logger.warning(t("python_pptx_not_installed_skipping_extraction"))
+             return "", {}
+
+         prs = Presentation(str(file_path))
+         text_parts = []
+
+         for slide_num, slide in enumerate(prs.slides, 1):
+             # Check if stopped before processing each slide
+             if self.stopped:
+                 return "", {}
+
+             # Check if paused before processing each slide
+             while self.paused:
+                 self.paused_event.wait(0.1)
+                 if self.stopped:
+                     return "", {}
+
+             text_parts.append(f"[Slide {slide_num}]")
+             for shape in slide.shapes:
+                 if hasattr(shape, "text"):
+                     text_parts.append(shape.text)  # pyright: ignore[reportAttributeAccessIssue]
+
+         metadata = {
+             "slide_count": len(prs.slides),
+         }
+
+         return "\n".join(text_parts), metadata
+
+     def _extract_image(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from image file using OCR."""
+         if Image is None or pytesseract is None:
+             logger.warning(t("pillow_or_tesseract_not_installed_skipping_ocr"))
+             return "", {}
+
+         try:
+             img = Image.open(file_path)
+             text = pytesseract.image_to_string(img)
+
+             metadata = {
+                 "format": img.format,
+                 "mode": img.mode,
+                 "size": img.size,
+             }
+
+             return text, metadata
+         except Exception as e:
+             logger.warning(t("could_not_perform_ocr_on_file", file_path=file_path, error=e))
+             return "", {}
+
+     def _extract_text(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from plain text file."""
+         encodings = ["utf-8", "latin-1", "cp1252", "utf-16"]
+
+         for encoding in encodings:
+             try:
+                 # Decode strictly so a failure falls through to the next encoding
+                 with open(file_path, encoding=encoding) as f:
+                     text = f.read()
+                 return text, {"encoding": encoding}
+             except UnicodeDecodeError:
+                 continue
+
+         return "", {}
+
+
+ def main():
+     """Main entry point for the document scanner."""
+     # Parse the language argument first, before translations are configured
+     temp_parser = argparse.ArgumentParser(add_help=False)
+     temp_parser.add_argument("--lang", choices=["en", "zh"], default="zh")
+     temp_args, _ = temp_parser.parse_known_args()
+
+     # Set the language
+     global USE_CHINESE
+     USE_CHINESE = temp_args.lang == "zh"
+
+     parser = argparse.ArgumentParser(description=t("document_scanner_description"))
+     parser.add_argument("input", type=str, nargs="?", default=str(cwd), help=t("input_directory_help"))
+     parser.add_argument("-r", "--rules", type=str, default="rules.json", help=t("rules_file_help"))
+     parser.add_argument("--recursive", action="store_true", help=t("recursive_help"))
+     parser.add_argument(
+         "-f",
+         "--file-types",
+         help=t("file_types_help"),
+         default="pdf,docx,xlsx,pptx,txt,odt,rtf,epub,csv,xml,html,md,jpg,jpeg,png,gif,bmp,tiff",
+     )
+     parser.add_argument("--use-pdf-ocr", help=t("use_pdf_ocr_help"), action="store_true")
+     parser.add_argument(
+         "--use-process-pool",
+         help=t("use_process_pool_help"),
+         action="store_true",
+     )
+     parser.add_argument(
+         "-b",
+         "--batch-size",
+         help=t("batch_size_help"),
+         default=50,
+         type=int,
+     )
+     parser.add_argument("-t", "--threads", help=t("threads_help"), default=4, type=int)
+     parser.add_argument("--progress", help=t("progress_help"), action="store_true")
+     parser.add_argument("-v", "--verbose", help=t("verbose_help"), action="store_true")
+
+     # Add the language argument to the full parser
+     parser.add_argument("--lang", help=t("language_help"), choices=["en", "zh"], default="zh")
+
+     args = parser.parse_args()
+
+     # Re-confirm the language setting (in case the user changed it in the full argument list)
+     USE_CHINESE = args.lang == "zh"
+
+     if args.verbose:
+         logger.setLevel(logging.DEBUG)
+
+     t0 = time.perf_counter()
+     # Validate input directory
+     input_dir = Path(args.input)
+     if not input_dir.exists() or not input_dir.is_dir():
+         logger.error(t("input_directory_does_not_exist", input_dir=args.input))
+         return
+     logger.info(t("scanning_directory", directory=str(input_dir)))
+
+     # Load rules file
+     rules_file = Path(args.rules)
+     if not rules_file.exists() or not rules_file.is_file():
+         rule_files_in_input_dir = list(input_dir.glob("rules*.json"))
+
+         if rule_files_in_input_dir:
+             rules_file = rule_files_in_input_dir[0]
+         else:
+             logger.error(t("rules_file_does_not_exist_alt", rules_file=args.rules))
+             return
+     logger.info(t("using_rules_file", rules_file=str(rules_file)))
+
+     try:
+         with open(rules_file, encoding="utf-8") as f:
+             rules_data = json.load(f)
+     except json.JSONDecodeError as e:
+         logger.error(t("invalid_json_in_rules_file", error=e))
+         return
+
+     # Parse rules
+     rules = []
+     if isinstance(rules_data, list):
+         rules = [Rule(rule) for rule in rules_data]
+     elif isinstance(rules_data, dict) and "rules" in rules_data:
+         rules = [Rule(rule) for rule in rules_data["rules"]]
+     else:
+         logger.error(t("invalid_rules_format"))
+         return
+
+     if not rules:
+         logger.error(t("no_valid_rules_found"))
+         return
+
+     # Parse file types
+     file_types = [ft.strip() for ft in args.file_types.split(",")]
+
+     # Create scanner and run scan
+     scanner = DocumentScanner(input_dir, rules, file_types, args.use_pdf_ocr, args.use_process_pool, args.batch_size)
+     results = scanner.scan(threads=args.threads, show_progress=args.progress)
+
+     # Save results to JSON file in input directory
+     output_file = input_dir / f"scan_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+     with open(output_file, "w", encoding="utf-8") as f:
+         json.dump(results, f, indent=2, ensure_ascii=False)
+
+     logger.info(t("results_saved_to", path=str(output_file)))
+     logger.info(t("total_time_elapsed", time=round(time.perf_counter() - t0, 2)))
+
+
+ if __name__ == "__main__":
+     main()
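
As a quick orientation for reviewers, here is a minimal usage sketch based only on the API visible in the diff above. The rules-file keys mirror what Rule.__init__ reads (name, pattern, regex, case_sensitive, context_lines, description), and main() accepts either a bare list of rules or a {"rules": [...]} object. The ./docs directory and the rule values are hypothetical placeholders, not part of the package.

    import json
    from pathlib import Path

    from sfi.docscan.docscan import DocumentScanner, Rule

    # Illustrative rules file; keys match what Rule.__init__ reads.
    rules_data = [
        {
            "name": "email",
            "pattern": r"[\w.+-]+@[\w-]+\.[\w.]+",
            "regex": True,
            "case_sensitive": False,
            "context_lines": 2,
            "description": "Email addresses",
        },
        {"name": "confidential", "pattern": "confidential", "regex": False},
    ]
    Path("rules.json").write_text(json.dumps(rules_data, indent=2), encoding="utf-8")

    # Programmatic scan, bypassing the argparse CLI in main().
    rules = [Rule(r) for r in rules_data]
    scanner = DocumentScanner(
        input_dir=Path("./docs"),  # hypothetical input directory
        rules=rules,
        file_types=["pdf", "docx", "txt"],
    )
    results = scanner.scan(threads=4, show_progress=True)
    print(results["scan_info"]["files_with_matches"])

The equivalent command-line run would be roughly `python sfi/docscan/docscan.py ./docs -r rules.json -t 4 --progress --lang en`, matching the arguments registered in main(). Note that log messages default to Chinese (USE_CHINESE is True by default), so pass --lang en on the CLI or set the module-level USE_CHINESE flag when using it as a library.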