pysfi-0.1.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sfi/docscan/docscan.py ADDED
@@ -0,0 +1,841 @@
+ """Scan documents and extract text, images, and metadata against a set of user-defined rules."""
+
+ from __future__ import annotations
+
+ import argparse
+ import csv
+ import html
+ import json
+ import logging
+ import re
+ import threading
+ import time
+ import xml.etree.ElementTree as ET
+ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Callable
+
+ try:
+     import fitz  # PyMuPDF
+ except ImportError:
+     fitz = None
+
+ try:
+     from docx import Document
+ except ImportError:
+     Document = None
+
+ try:
+     from openpyxl import load_workbook
+ except ImportError:
+     load_workbook = None
+
+ try:
+     from PIL import Image
+ except ImportError:
+     Image = None
+
+ try:
+     import pytesseract
+ except ImportError:
+     pytesseract = None
+
+ try:
+     import odf.opendocument as odf_odt  # ODT support
+ except ImportError:
+     odf_odt = None
+
+ try:
+     import ebooklib  # EPUB support
+     from ebooklib import epub
+ except ImportError:
+     ebooklib = None
+
+ try:
+     import markdown  # Markdown to text
+ except ImportError:
+     markdown = None
+
+ try:
+     import pypdf  # Alternative PDF library
+ except ImportError:
+     pypdf = None
+
+ logging.basicConfig(level=logging.INFO, format="%(message)s")
+ logger = logging.getLogger(__name__)
+ cwd = Path.cwd()
+
+
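
The optional imports above map to the following PyPI distributions (the names are taken from the warning messages later in this file, plus Pillow and pytesseract for OCR); a sketch of installing the full set, noting that pytesseract additionally requires the Tesseract binary on the system:

    pip install pymupdf pypdf python-docx openpyxl python-pptx Pillow pytesseract odfpy ebooklib markdown
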
+ class Rule:
+     """Represents a scanning rule with optimized pattern matching."""
+
+     def __init__(self, rule_data: dict[str, Any]):
+         """Initialize rule from dictionary."""
+         self.name = rule_data.get("name", "")
+         self.pattern = rule_data.get("pattern", "")
+         self.is_regex = rule_data.get("regex", False)
+         self.case_sensitive = rule_data.get("case_sensitive", False)
+         self.context_lines = rule_data.get("context_lines", 3)
+         self.description = rule_data.get("description", "")
+
+         if self.is_regex:
+             flags = 0 if self.case_sensitive else re.IGNORECASE
+             try:
+                 # re.ASCII restricts \w, \d and \s to ASCII for faster matching
+                 # (non-ASCII word characters will not match these classes)
+                 self.compiled_pattern = re.compile(self.pattern, flags | re.ASCII)
+             except re.error as e:
+                 logger.warning(f"Invalid regex pattern '{self.pattern}': {e}")
+                 self.compiled_pattern = None
+         else:
+             self.compiled_pattern = None
+
+     def search(self, text: str) -> list[dict[str, Any]]:
+         """Search for pattern in text and return matches."""
+         if not text or not self.pattern:
+             return []
+
+         matches = []
+         lines = text.split("\n")
+
+         if self.is_regex and self.compiled_pattern:
+             # Regex search
+             for line_num, line in enumerate(lines, 1):
+                 for match in self.compiled_pattern.finditer(line):
+                     matches.append({
+                         "type": "regex",
+                         "line_number": line_num,
+                         "match": match.group(),
+                         "start": match.start(),
+                         "end": match.end(),
+                         "context": self._get_context(lines, line_num - 1),
+                     })
+         else:
+             # Simple text search
+             search_text = self.pattern if self.case_sensitive else self.pattern.lower()
+             for line_num, line in enumerate(lines, 1):
+                 compare_line = line if self.case_sensitive else line.lower()
+                 start = 0
+                 while True:
+                     pos = compare_line.find(search_text, start)
+                     if pos == -1:
+                         break
+                     matches.append({
+                         "type": "text",
+                         "line_number": line_num,
+                         "match": line[pos : pos + len(self.pattern)],
+                         "start": pos,
+                         "end": pos + len(self.pattern),
+                         "context": self._get_context(lines, line_num - 1),
+                     })
+                     start = pos + 1
+
+         return matches
+
+     def _get_context(self, lines: list[str], line_index: int) -> list[str]:
+         """Get context lines around a match."""
+         start = max(0, line_index - self.context_lines)
+         end = min(len(lines), line_index + self.context_lines + 1)
+         return lines[start:end]
+
+
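
A minimal sketch of constructing and running a single Rule (not part of the package; the dict keys follow Rule.__init__ above, and the sample text is made up):

    rule = Rule({
        "name": "email",
        "pattern": r"[\w.+-]+@[\w-]+\.\w+",
        "regex": True,
        "context_lines": 1,
        "description": "Email addresses",
    })
    for m in rule.search("line one\ncontact: dev@example.com"):
        print(m["line_number"], m["match"], m["context"])
        # -> 2 dev@example.com ['line one', 'contact: dev@example.com']
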
+ class DocumentScanner:
+     """High-performance document scanner with multi-format support."""
+
+     def __init__(
+         self,
+         input_dir: Path,
+         rules: list[Rule],
+         file_types: list[str],
+         use_pdf_ocr: bool = False,
+         use_process_pool: bool = False,
+         batch_size: int = 50,
+     ):
+         """Initialize scanner with input directory and rules.
+
+         Args:
+             input_dir: Directory containing documents to scan
+             rules: List of scanning rules
+             file_types: List of file extensions to scan
+             use_pdf_ocr: Use OCR for PDF files
+             use_process_pool: Use process pool instead of thread pool for CPU-intensive tasks
+             batch_size: Number of files to process in each batch
+         """
+         self.input_dir = Path(input_dir)
+         self.rules = rules
+         self.file_types = file_types
+         self.use_pdf_ocr = use_pdf_ocr
+         self.use_process_pool = use_process_pool
+         self.batch_size = batch_size
+         self.results = []
+         self.paused = False
+         self.paused_event = threading.Event()
+         self.paused_event.set()  # Initially not paused
+         self.stopped = False
+         self._progress_callback = None
+
+     def set_progress_callback(self, callback: Callable[[int, int], None]) -> None:
+         """Set callback function for progress updates.
+
+         Args:
+             callback: Function to call with progress (current, total)
+         """
+         self._progress_callback = callback
+
+     def pause(self) -> None:
+         """Pause the scanning process."""
+         self.paused = True
+         self.paused_event.clear()
+         logger.info("Scan paused")
+
+     def resume(self) -> None:
+         """Resume the scanning process."""
+         self.paused = False
+         self.paused_event.set()
+         logger.info("Scan resumed")
+
+     def stop(self) -> None:
+         """Stop the scanning process."""
+         self.stopped = True
+         self.paused_event.set()  # Ensure thread can exit
+         logger.info("Scan stopped")
+
+     def is_paused(self) -> bool:
+         """Check if the scanner is paused."""
+         return self.paused
+
+     def is_stopped(self) -> bool:
+         """Check if the scanner is stopped."""
+         return self.stopped
+
+     def scan(self, threads: int = 4, show_progress: bool = False) -> dict[str, Any]:
+         """Scan all documents in input directory.
+
+         Args:
+             threads: Number of worker threads/processes
+             show_progress: Show progress bar
+
+         Returns:
+             Dictionary containing scan results
+         """
+         self.stopped = False
+         self.paused = False
+         self.paused_event.set()
+
+         logger.info(f"Scanning directory: {self.input_dir}")
+         files = self._collect_files()
+         logger.info(f"Found {len(files)} files to scan")
+
+         results = {
+             "scan_info": {
+                 "input_directory": str(self.input_dir),
+                 "scan_time": datetime.now().isoformat(),
+                 "file_types_scanned": self.file_types,
+                 "total_files": len(files),
+                 "rules_count": len(self.rules),
+                 "use_pdf_ocr": self.use_pdf_ocr,
+                 "use_process_pool": self.use_process_pool,
+             },
+             "rules": [{"name": r.name, "pattern": r.pattern, "is_regex": r.is_regex} for r in self.rules],
+             "matches": [],
+         }
+
+         # Scan files in parallel.  Note: ProcessPoolExecutor must pickle self
+         # for each task; the threading.Event held by the scanner is not
+         # picklable, and pause/stop flags are not shared across processes,
+         # so the thread pool is the safe default.
+         processed = 0
+         executor_class = ProcessPoolExecutor if self.use_process_pool else ThreadPoolExecutor
+         with executor_class(max_workers=threads) as executor:
+             future_to_file = {executor.submit(self._scan_file_with_pause_check, file): file for file in files}
+
+             for future in as_completed(future_to_file):
+                 # Check if stopped before processing this future
+                 if self.stopped:
+                     logger.info("Scan stopped by user, cancelling remaining tasks...")
+                     # Cancel all remaining futures
+                     for f in future_to_file:
+                         if not f.done():
+                             f.cancel()
+                     break
+
+                 # Wait if paused (already-submitted files keep processing;
+                 # only collection of results is held back)
+                 while self.paused:
+                     time.sleep(0.1)
+                     if self.stopped:
+                         logger.info("Scan stopped while paused")
+                         break
+
+                 file_path = future_to_file[future]
+                 try:
+                     file_result = future.result()
+                     if file_result and file_result["matches"]:
+                         results["matches"].append(file_result)
+                         logger.info(f"Found matches in: {file_path.name}")
+                 except Exception as e:
+                     logger.error(f"Error scanning {file_path}: {e}")
+
+                 processed += 1
+
+                 # Report progress
+                 if show_progress and processed % 10 == 0:
+                     logger.info(f"Progress: {processed}/{len(files)} files processed")
+
+                 # Call progress callback if set
+                 if self._progress_callback:
+                     self._progress_callback(processed, len(files))
+
+         results["scan_info"]["files_with_matches"] = len(results["matches"])
+         results["scan_info"]["files_processed"] = processed
+         results["stopped"] = self.stopped
+
+         if self.stopped:
+             logger.info(f"Scan stopped. Processed {processed} files")
+         else:
+             logger.info(f"Scan complete. Found matches in {len(results['matches'])} files")
+
+         return results
+
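
A sketch of driving scan() from a controlling thread via the pause/stop and progress hooks above (the directory, rule, and timings are hypothetical; threading and time come from the module imports):

    scanner = DocumentScanner(Path("/tmp/docs"), [Rule({"name": "todo", "pattern": "TODO"})], ["txt", "md"])
    scanner.set_progress_callback(lambda done, total: print(f"{done}/{total} files"))
    worker = threading.Thread(target=scanner.scan, kwargs={"threads": 2})
    worker.start()
    time.sleep(1.0)
    scanner.pause()   # holds back result collection; queued files keep running
    scanner.resume()
    worker.join()     # or call scanner.stop() to cancel the remaining queue
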
+     def _scan_file_with_pause_check(self, file_path: Path) -> dict[str, Any]:
+         """Scan a single file with pause check."""
+         # Check if stopped before processing
+         if self.stopped:
+             return {}
+
+         return self._scan_file(file_path)
+
+     def _collect_files(self) -> list[Path]:
+         """Collect all files matching the specified types."""
+         files = []
+         for ext in self.file_types:
+             files.extend(self.input_dir.rglob(f"*.{ext.lower()}"))
+             files.extend(self.input_dir.rglob(f"*.{ext.upper()}"))
+         return list(set(files))  # Remove duplicates
+
+     def _scan_file(self, file_path: Path) -> dict[str, Any]:
+         """Scan a single file and return matches."""
+         file_start_time = time.perf_counter()
+         ext = file_path.suffix.lower().lstrip(".")
+         text = ""
+         metadata = {}
+
+         try:
+             # Route to appropriate extractor
+             if ext == "pdf":
+                 text, metadata = self._extract_pdf(file_path)
+             elif ext == "odt":
+                 text, metadata = self._extract_odt(file_path)
+             elif ext == "rtf":
+                 text, metadata = self._extract_rtf(file_path)
+             elif ext == "epub":
+                 text, metadata = self._extract_epub(file_path)
+             elif ext == "csv":
+                 text, metadata = self._extract_csv(file_path)
+             elif ext == "xml":
+                 text, metadata = self._extract_xml(file_path)
+             elif ext in ("html", "htm"):
+                 text, metadata = self._extract_html(file_path)
+             elif ext == "md":
+                 text, metadata = self._extract_markdown(file_path)
+             elif ext in ["docx", "doc"]:
+                 # python-docx reads .docx only; legacy .doc files raise and are skipped
+                 text, metadata = self._extract_docx(file_path)
+             elif ext in ["xlsx", "xls"]:
+                 # openpyxl reads .xlsx only; legacy .xls files raise and are skipped
+                 text, metadata = self._extract_xlsx(file_path)
+             elif ext in ["pptx", "ppt"]:
+                 text, metadata = self._extract_pptx(file_path)
+             elif ext in ["jpg", "jpeg", "png", "gif", "bmp", "tiff"]:
+                 text, metadata = self._extract_image(file_path)
+             else:
+                 text, metadata = self._extract_text(file_path)
+         except Exception as e:
+             logger.warning(f"Could not extract text from {file_path}: {e}")
+             return {}
+
+         processing_time = time.perf_counter() - file_start_time
+
+         if not text:
+             return {}
+
+         # Apply all rules
+         file_matches = []
+         for rule in self.rules:
+             rule_matches = rule.search(text)
+             if rule_matches:
+                 for match in rule_matches:
+                     match["rule_name"] = rule.name
+                     match["rule_description"] = rule.description
+                 file_matches.extend(rule_matches)
+
+         if not file_matches:
+             return {}
+
+         # Add processing time to metadata
+         metadata["processing_time_seconds"] = round(processing_time, 3)
+
+         logger.info(f"Processed {file_path.name} ({ext}) in {processing_time:.3f}s - {len(file_matches)} matches found")
+
+         return {
+             "file_path": str(file_path),
+             "file_type": ext,
+             "file_size": file_path.stat().st_size,
+             "metadata": metadata,
+             "matches": file_matches,
+         }
+
+     def _extract_pdf(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from PDF file with fallback."""
+         if fitz is not None:
+             return self._extract_pdf_fitz(file_path)
+         elif pypdf is not None:
+             return self._extract_pdf_pypdf(file_path)
+         else:
+             logger.warning("No PDF library installed (pymupdf or pypdf)")
+             return "", {}
+
+     def _extract_pdf_fitz(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from PDF using PyMuPDF (fastest method)."""
+         if not fitz:
+             logger.warning("PyMuPDF not installed")
+             return "", {}
+
+         doc = fitz.open(str(file_path))
+         if doc.page_count == 0:
+             logger.warning(f"No pages found in {file_path}")
+             doc.close()
+             return "", {}
+
+         # Missing document metadata is not fatal; fall back to empty values
+         pdf_meta = doc.metadata or {}
+         text_parts = []
+         metadata = {
+             "page_count": doc.page_count,
+             "title": pdf_meta.get("title", ""),
+             "author": pdf_meta.get("author", ""),
+             "subject": pdf_meta.get("subject", ""),
+             "creator": pdf_meta.get("creator", ""),
+         }
+
+         if self.use_pdf_ocr and pytesseract and Image:
+             # OCR for image-based PDFs
+             import io
+
+             for page_num, page in enumerate(doc, 1):  # pyright: ignore[reportArgumentType]
+                 pix = page.get_pixmap()
+                 img_data = pix.tobytes("png")
+                 image = Image.open(io.BytesIO(img_data))
+                 text = pytesseract.image_to_string(image)
+                 text_parts.append(f"[Page {page_num}]\n{text}")
+         else:
+             # Extract text directly (faster)
+             for page_num, page in enumerate(doc, 1):  # pyright: ignore[reportArgumentType]
+                 text = page.get_text()
+                 text_parts.append(f"[Page {page_num}]\n{text}")
+
+         doc.close()
+         return "\n\n".join(text_parts), metadata
+
+     def _extract_pdf_pypdf(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from PDF using pypdf (fallback method)."""
+         if not pypdf:
+             logger.warning("pypdf not installed, skipping PDF extraction")
+             return "", {}
+
+         text_parts = []
+         metadata = {}
+         try:
+             with open(file_path, "rb") as f:
+                 pdf_reader = pypdf.PdfReader(f)
+
+                 # Missing document metadata is not fatal; fall back to empty values
+                 pdf_meta = pdf_reader.metadata or {}
+                 metadata = {
+                     "page_count": len(pdf_reader.pages),
+                     "title": pdf_meta.get("/Title", ""),
+                     "author": pdf_meta.get("/Author", ""),
+                 }
+
+                 for page_num, page in enumerate(pdf_reader.pages, 1):
+                     text = page.extract_text()
+                     text_parts.append(f"[Page {page_num}]\n{text}")
+
+         except Exception as e:
+             logger.warning(f"Error extracting PDF with pypdf: {e}")
+             return "", {}
+
+         return "\n\n".join(text_parts), metadata
+
+     def _extract_odt(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from ODT (OpenDocument Text) file."""
+         if odf_odt is None:
+             logger.warning("odfpy not installed, skipping ODT extraction")
+             return "", {}
+
+         try:
+             doc = odf_odt.load(file_path)
+             text = doc.textual_content  # pyright: ignore[reportAttributeAccessIssue]
+
+             metadata = {
+                 "format": "ODT",
+             }
+
+             return text, metadata
+         except Exception as e:
+             logger.warning(f"Error extracting ODT: {e}")
+             return "", {}
+
+     def _extract_rtf(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from RTF (Rich Text Format) file."""
+         try:
+             with open(file_path, "rb") as f:
+                 content = f.read()
+
+             # Simple RTF text extraction (removes control words and group braces)
+             text = ""
+             i = 0
+             while i < len(content):
+                 if content[i] == ord("\\") and i + 1 < len(content):
+                     if content[i + 1] in [ord("'"), ord("*"), ord("\\")]:
+                         i += 2
+                         continue
+                     # Skip control words
+                     while (
+                         i < len(content)
+                         and content[i] != ord(" ")
+                         and content[i] != ord("{")
+                         and content[i] != ord("}")
+                     ):
+                         i += 1
+                 elif 32 <= content[i] <= 126 and content[i] not in (ord("{"), ord("}")):
+                     # Printable ASCII, excluding the RTF group delimiters { and }
+                     text += chr(content[i])
+                 i += 1
+
+             metadata = {
+                 "format": "RTF",
+             }
+
+             return text, metadata
+         except Exception as e:
+             logger.warning(f"Error extracting RTF: {e}")
+             return "", {}
+
+     def _extract_epub(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from EPUB (ebook) file."""
+         if ebooklib is None:
+             logger.warning("ebooklib not installed, skipping EPUB extraction")
+             return "", {}
+
+         try:
+             book = epub.read_epub(file_path)
+             text_parts = []
+
+             # Extract text from all items
+             for item in book.get_items():
+                 if item.get_type() == ebooklib.ITEM_DOCUMENT:  # pyright: ignore[reportAttributeAccessIssue]
+                     # Strip HTML tags (re is already imported at module level)
+                     html_content = item.get_content().decode("utf-8")  # pyright: ignore[reportAttributeAccessIssue]
+                     text = re.sub(r"<[^>]+>", " ", html_content)
+                     text = html.unescape(text)
+                     text_parts.append(text)
+
+             metadata = {
+                 "title": book.get_metadata("DC", "title")[0][0] if book.get_metadata("DC", "title") else "",  # pyright: ignore[reportAttributeAccessIssue]
+                 "author": book.get_metadata("DC", "creator")[0][0] if book.get_metadata("DC", "creator") else "",  # pyright: ignore[reportAttributeAccessIssue]
+                 "format": "EPUB",
+             }
+
+             return "\n\n".join(text_parts), metadata
+         except Exception as e:
+             logger.warning(f"Error extracting EPUB: {e}")
+             return "", {}
+
+     def _extract_csv(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from CSV file."""
+         try:
+             text_parts = []
+             with open(file_path, encoding="utf-8", errors="ignore") as f:
+                 reader = csv.reader(f)
+                 for row in reader:
+                     row_text = " | ".join(str(cell) for cell in row)
+                     text_parts.append(row_text)
+
+             metadata = {
+                 "format": "CSV",
+             }
+
+             return "\n".join(text_parts), metadata
+         except Exception as e:
+             logger.warning(f"Error extracting CSV: {e}")
+             return "", {}
+
+     def _extract_xml(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from XML file."""
+         try:
+             tree = ET.parse(file_path)
+             root = tree.getroot()
+
+             # Extract all text content
+             text_parts = [elem.text for elem in root.iter() if elem.text and elem.text.strip()]
+             text = "\n".join(text_parts)
+
+             metadata = {
+                 "format": "XML",
+                 "root_tag": root.tag,
+             }
+
+             return text, metadata
+         except Exception as e:
+             logger.warning(f"Error extracting XML: {e}")
+             return "", {}
+
+     def _extract_html(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from HTML file."""
+         try:
+             with open(file_path, encoding="utf-8", errors="ignore") as f:
+                 html_content = f.read()
+
+             # Strip HTML tags (re is already imported at module level)
+             text = re.sub(r"<[^>]+>", " ", html_content)
+             text = html.unescape(text)
+             text = re.sub(r"\s+", " ", text).strip()
+
+             metadata = {
+                 "format": "HTML",
+             }
+
+             return text, metadata
+         except Exception as e:
+             logger.warning(f"Error extracting HTML: {e}")
+             return "", {}
+
+     def _extract_markdown(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from Markdown file."""
+         try:
+             with open(file_path, encoding="utf-8", errors="ignore") as f:
+                 content = f.read()
+
+             if markdown:
+                 # Convert Markdown to HTML, then strip the tags
+                 html_content = markdown.markdown(content)  # pyright: ignore[reportAttributeAccessIssue]
+                 text = re.sub(r"<[^>]+>", " ", html_content)
+                 text = html.unescape(text)
+                 text = re.sub(r"\s+", " ", text).strip()
+             else:
+                 # Fall back to matching against the raw Markdown source
+                 text = content
+
+             metadata = {
+                 "format": "Markdown",
+             }
+
+             return text, metadata
+         except Exception as e:
+             logger.warning(f"Error extracting Markdown: {e}")
+             return "", {}
+
+     def _extract_docx(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from DOCX file."""
+         if Document is None:
+             logger.warning("python-docx not installed, skipping DOCX extraction")
+             return "", {}
+
+         doc = Document(str(file_path))
+         text_parts = []
+
+         for paragraph in doc.paragraphs:
+             text_parts.append(paragraph.text)
+
+         # Extract tables
+         for table in doc.tables:
+             for row in table.rows:
+                 row_text = " | ".join(cell.text for cell in row.cells)
+                 text_parts.append(row_text)
+
+         metadata = {
+             "paragraph_count": len(doc.paragraphs),
+             "table_count": len(doc.tables),
+         }
+
+         return "\n".join(text_parts), metadata
+
+     def _extract_xlsx(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from XLSX file."""
+         if load_workbook is None:
+             logger.warning("openpyxl not installed, skipping XLSX extraction")
+             return "", {}
+
+         wb = load_workbook(file_path, read_only=True, data_only=True)
+         text_parts = []
+
+         for sheet_name in wb.sheetnames:
+             sheet = wb[sheet_name]
+             text_parts.append(f"[Sheet: {sheet_name}]")
+             for row in sheet.iter_rows(values_only=True):
+                 row_text = " | ".join(str(cell) if cell is not None else "" for cell in row)
+                 if row_text.strip():
+                     text_parts.append(row_text)
+
+         metadata = {
+             "sheet_count": len(wb.sheetnames),
+             "sheets": wb.sheetnames,
+         }
+
+         return "\n".join(text_parts), metadata
+
+     def _extract_pptx(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from PPTX file."""
+         try:
+             from pptx import Presentation
+         except ImportError:
+             logger.warning("python-pptx not installed, skipping PPTX extraction")
+             return "", {}
+
+         prs = Presentation(str(file_path))
+         text_parts = []
+
+         for slide_num, slide in enumerate(prs.slides, 1):
+             text_parts.append(f"[Slide {slide_num}]")
+             for shape in slide.shapes:
+                 if hasattr(shape, "text"):
+                     text_parts.append(shape.text)  # pyright: ignore[reportAttributeAccessIssue]
+
+         metadata = {
+             "slide_count": len(prs.slides),
+         }
+
+         return "\n".join(text_parts), metadata
+
+     def _extract_image(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from image file using OCR."""
+         if Image is None or pytesseract is None:
+             logger.warning("PIL or pytesseract not installed, skipping image OCR")
+             return "", {}
+
+         try:
+             img = Image.open(file_path)
+             text = pytesseract.image_to_string(img)
+
+             metadata = {
+                 "format": img.format,
+                 "mode": img.mode,
+                 "size": img.size,
+             }
+
+             return text, metadata
+         except Exception as e:
+             logger.warning(f"Could not perform OCR on {file_path}: {e}")
+             return "", {}
+
+     def _extract_text(self, file_path: Path) -> tuple[str, dict[str, Any]]:
+         """Extract text from plain text file."""
+         encodings = ["utf-8", "latin-1", "cp1252", "utf-16"]
+
+         for encoding in encodings:
+             try:
+                 # Decode strictly so a wrong guess raises and the next encoding
+                 # is tried (errors="ignore" would mask the error and make the
+                 # fallback chain unreachable)
+                 with open(file_path, encoding=encoding) as f:
+                     text = f.read()
+                 return text, {"encoding": encoding}
+             except UnicodeDecodeError:
+                 continue
+
+         return "", {}
+
+
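
For reference, a minimal end-to-end programmatic call and the shape of the dict scan() returns (keys taken from the code above; paths and rules are illustrative):

    rules = [Rule({"name": "api-key", "pattern": r"api[_-]?key", "regex": True})]
    scanner = DocumentScanner(Path("./docs"), rules, ["pdf", "docx", "txt"])
    results = scanner.scan(threads=4, show_progress=True)
    # results["scan_info"] -> input_directory, scan_time, total_files, files_processed, ...
    # results["matches"]   -> one dict per file: file_path, file_type, file_size, metadata, matches
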
+ def main():
+     """Main entry point for document scanner."""
+     parser = argparse.ArgumentParser(
+         description="Scan documents and extract text, images, and metadata with high performance"
+     )
+     parser.add_argument("input", type=str, nargs="?", default=str(cwd), help="Input directory")
+     parser.add_argument("-r", "--rules", type=str, default="rules.json", help="Rules file (JSON)")
+     # Note: scanning is always recursive (rglob); this flag is currently a no-op
+     parser.add_argument("--recursive", action="store_true", help="Scan files recursively")
+     parser.add_argument(
+         "-f",
+         "--file-types",
+         help="File types to scan (comma-separated)",
+         default="pdf,docx,xlsx,pptx,txt,odt,rtf,epub,csv,xml,html,md",
+     )
+     parser.add_argument("--use-pdf-ocr", help="Use PDF OCR for image-based PDFs", action="store_true")
+     parser.add_argument(
+         "--use-process-pool",
+         help="Use process pool instead of thread pool (better for CPU-intensive tasks)",
+         action="store_true",
+     )
+     parser.add_argument(
+         "-b",
+         "--batch-size",
+         help="Number of files to process in each batch",
+         default=50,
+         type=int,
+     )
+     parser.add_argument("-t", "--threads", help="Number of threads for parallel scanning", default=4, type=int)
+     parser.add_argument("--progress", help="Show progress bar", action="store_true")
+     parser.add_argument("-v", "--verbose", help="Verbose output", action="store_true")
+     args = parser.parse_args()
+
+     if args.verbose:
+         logger.setLevel(logging.DEBUG)
+
+     t0 = time.perf_counter()
+     # Validate input directory
+     input_dir = Path(args.input)
+     if not input_dir.exists() or not input_dir.is_dir():
+         logger.error(f"Input directory does not exist: {args.input}")
+         return
+     logger.info(f"Scanning directory: {input_dir}...")
+
+     # Load rules file; if the given path is missing, fall back to the first
+     # rules*.json found in the input directory
+     rules_file = Path(args.rules)
+     if not rules_file.exists() or not rules_file.is_file():
+         rule_files_in_input_dir = list(input_dir.glob("rules*.json"))
+
+         if rule_files_in_input_dir:
+             rules_file = rule_files_in_input_dir[0]
+         else:
+             logger.error(f"Rules file does not exist: {args.rules}")
+             return
+     logger.info(f"Using rules file: {rules_file}")
+
+     try:
+         with open(rules_file, encoding="utf-8") as f:
+             rules_data = json.load(f)
+     except json.JSONDecodeError as e:
+         logger.error(f"Invalid JSON in rules file: {e}")
+         return
+
+     # Parse rules
+     rules = []
+     if isinstance(rules_data, list):
+         rules = [Rule(rule) for rule in rules_data]
+     elif isinstance(rules_data, dict) and "rules" in rules_data:
+         rules = [Rule(rule) for rule in rules_data["rules"]]
+     else:
+         logger.error("Invalid rules format. Expected a list or dict with 'rules' key")
+         return
+
+     if not rules:
+         logger.error("No valid rules found")
+         return
+
+     # Parse file types
+     file_types = [ft.strip() for ft in args.file_types.split(",")]
+
+     # Create scanner and run scan
+     scanner = DocumentScanner(input_dir, rules, file_types, args.use_pdf_ocr, args.use_process_pool, args.batch_size)
+     results = scanner.scan(threads=args.threads, show_progress=args.progress)
+
+     # Save results to JSON file in input directory
+     output_file = input_dir / f"scan_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+     with open(output_file, "w", encoding="utf-8") as f:
+         json.dump(results, f, indent=2, ensure_ascii=False)
+
+     logger.info(f"Results saved to: {output_file}")
+     logger.info(f"Total time elapsed: {time.perf_counter() - t0:.2f}s")
+
+
+ if __name__ == "__main__":
+     main()
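
An example invocation and rules file to close. The rules file below uses the list form main() accepts (a dict with a "rules" key also works, per the parsing above); the command, path, and patterns are illustrative, and the keys follow Rule.__init__:

    python docscan.py ./contracts -r rules.json -f pdf,docx,txt -t 8 --progress

    # rules.json
    [
      {"name": "ssn", "pattern": "\\d{3}-\\d{2}-\\d{4}", "regex": true,
       "context_lines": 2, "description": "US Social Security numbers"},
      {"name": "confidential", "pattern": "confidential", "case_sensitive": false}
    ]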