pysfi-0.1.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pysfi-0.1.7.dist-info/METADATA +134 -0
- pysfi-0.1.7.dist-info/RECORD +31 -0
- pysfi-0.1.7.dist-info/WHEEL +4 -0
- pysfi-0.1.7.dist-info/entry_points.txt +15 -0
- sfi/__init__.py +3 -0
- sfi/alarmclock/__init__.py +0 -0
- sfi/alarmclock/alarmclock.py +367 -0
- sfi/bumpversion/__init__.py +3 -0
- sfi/bumpversion/bumpversion.py +535 -0
- sfi/cli.py +11 -0
- sfi/docscan/__init__.py +3 -0
- sfi/docscan/docscan.py +841 -0
- sfi/docscan/docscan_gui.py +596 -0
- sfi/embedinstall/__init__.py +0 -0
- sfi/embedinstall/embedinstall.py +418 -0
- sfi/filedate/__init__.py +0 -0
- sfi/filedate/filedate.py +112 -0
- sfi/makepython/__init__.py +0 -0
- sfi/makepython/makepython.py +326 -0
- sfi/pdfsplit/__init__.py +0 -0
- sfi/pdfsplit/pdfsplit.py +173 -0
- sfi/projectparse/__init__.py +0 -0
- sfi/projectparse/projectparse.py +152 -0
- sfi/pyloadergen/__init__.py +0 -0
- sfi/pyloadergen/pyloadergen.py +995 -0
- sfi/pypacker/__init__.py +0 -0
- sfi/pypacker/fspacker.py +91 -0
- sfi/taskkill/__init__.py +0 -0
- sfi/taskkill/taskkill.py +236 -0
- sfi/which/__init__.py +0 -0
- sfi/which/which.py +74 -0
sfi/docscan/docscan.py
ADDED
@@ -0,0 +1,841 @@
"""Scan documents and extract text, images, and metadata with certain rules."""

from __future__ import annotations

import argparse
import csv
import html
import json
import logging
import re
import threading
import time
import xml.etree.ElementTree as ET
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path
from typing import Any, Callable

try:
    import fitz  # PyMuPDF
except ImportError:
    fitz = None

try:
    from docx import Document
except ImportError:
    Document = None

try:
    from openpyxl import load_workbook
except ImportError:
    load_workbook = None

try:
    from PIL import Image
except ImportError:
    Image = None

try:
    import pytesseract
except ImportError:
    pytesseract = None

try:
    import odf.opendocument as odf_odt  # ODT support
except ImportError:
    odf_odt = None

try:
    import ebooklib  # EPUB support
    from ebooklib import epub
except ImportError:
    ebooklib = None

try:
    import markdown  # Markdown to text
except ImportError:
    markdown = None

try:
    import pypdf  # Alternative PDF library
except ImportError:
    pypdf = None

logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)
cwd = Path.cwd()


class Rule:
    """Represents a scanning rule with optimized pattern matching."""

    def __init__(self, rule_data: dict[str, Any]):
        """Initialize rule from dictionary."""
        self.name = rule_data.get("name", "")
        self.pattern = rule_data.get("pattern", "")
        self.is_regex = rule_data.get("regex", False)
        self.case_sensitive = rule_data.get("case_sensitive", False)
        self.context_lines = rule_data.get("context_lines", 3)
        self.description = rule_data.get("description", "")

        if self.is_regex:
            flags = 0 if self.case_sensitive else re.IGNORECASE
            try:
                # Use re.ASCII for faster matching when possible
                self.compiled_pattern = re.compile(self.pattern, flags | re.ASCII)
            except re.error as e:
                logger.warning(f"Invalid regex pattern '{self.pattern}': {e}")
                self.compiled_pattern = None
        else:
            self.compiled_pattern = None

    def search(self, text: str) -> list[dict[str, Any]]:
        """Search for pattern in text and return matches."""
        if not text or not self.pattern:
            return []

        matches = []
        lines = text.split("\n")

        if self.is_regex and self.compiled_pattern:
            # Regex search
            for line_num, line in enumerate(lines, 1):
                for match in self.compiled_pattern.finditer(line):
                    matches.append({
                        "type": "regex",
                        "line_number": line_num,
                        "match": match.group(),
                        "start": match.start(),
                        "end": match.end(),
                        "context": self._get_context(lines, line_num - 1),
                    })
        else:
            # Simple text search
            search_text = self.pattern if self.case_sensitive else self.pattern.lower()
            for line_num, line in enumerate(lines, 1):
                compare_line = line if self.case_sensitive else line.lower()
                start = 0
                while True:
                    pos = compare_line.find(search_text, start)
                    if pos == -1:
                        break
                    matches.append({
                        "type": "text",
                        "line_number": line_num,
                        "match": line[pos : pos + len(self.pattern)],
                        "start": pos,
                        "end": pos + len(self.pattern),
                        "context": self._get_context(lines, line_num - 1),
                    })
                    start = pos + 1

        return matches

    def _get_context(self, lines: list[str], line_index: int) -> list[str]:
        """Get context lines around a match."""
        start = max(0, line_index - self.context_lines)
        end = min(len(lines), line_index + self.context_lines + 1)
        return lines[start:end]


class DocumentScanner:
    """High-performance document scanner with multi-format support."""

    def __init__(
        self,
        input_dir: Path,
        rules: list[Rule],
        file_types: list[str],
        use_pdf_ocr: bool = False,
        use_process_pool: bool = False,
        batch_size: int = 50,
    ):
        """Initialize scanner with input directory and rules.

        Args:
            input_dir: Directory containing documents to scan
            rules: List of scanning rules
            file_types: List of file extensions to scan
            use_pdf_ocr: Use OCR for PDF files
            use_process_pool: Use process pool instead of thread pool for CPU-intensive tasks
            batch_size: Number of files to process in each batch
        """
        self.input_dir = Path(input_dir)
        self.rules = rules
        self.file_types = file_types
        self.use_pdf_ocr = use_pdf_ocr
        self.use_process_pool = use_process_pool
        self.batch_size = batch_size
        self.results = []
        self.paused = False
        self.paused_event = threading.Event()
        self.paused_event.set()  # Initially not paused
        self.stopped = False
        self._progress_callback = None

    def set_progress_callback(self, callback: Callable[[int, int], None]) -> None:
        """Set callback function for progress updates.

        Args:
            callback: Function to call with progress (current, total)
        """
        self._progress_callback = callback

    def pause(self) -> None:
        """Pause the scanning process."""
        self.paused = True
        self.paused_event.clear()
        logger.info("Scan paused")

    def resume(self) -> None:
        """Resume the scanning process."""
        self.paused = False
        self.paused_event.set()
        logger.info("Scan resumed")

    def stop(self) -> None:
        """Stop the scanning process."""
        self.stopped = True
        self.paused_event.set()  # Ensure thread can exit
        logger.info("Scan stopped")

    def is_paused(self) -> bool:
        """Check if the scanner is paused."""
        return self.paused

    def is_stopped(self) -> bool:
        """Check if the scanner is stopped."""
        return self.stopped

    def scan(self, threads: int = 4, show_progress: bool = False) -> dict[str, Any]:
        """Scan all documents in input directory.

        Args:
            threads: Number of worker threads/processes
            show_progress: Show progress bar

        Returns:
            Dictionary containing scan results
        """
        self.stopped = False
        self.paused = False
        self.paused_event.set()

        logger.info(f"Scanning directory: {self.input_dir}")
        files = self._collect_files()
        logger.info(f"Found {len(files)} files to scan")

        results = {
            "scan_info": {
                "input_directory": str(self.input_dir),
                "scan_time": datetime.now().isoformat(),
                "file_types_scanned": self.file_types,
                "total_files": len(files),
                "rules_count": len(self.rules),
                "use_pdf_ocr": self.use_pdf_ocr,
                "use_process_pool": self.use_process_pool,
            },
            "rules": [{"name": r.name, "pattern": r.pattern, "is_regex": r.is_regex} for r in self.rules],
            "matches": [],
        }

        # Scan files in parallel
        processed = 0
        executor_class = ProcessPoolExecutor if self.use_process_pool else ThreadPoolExecutor
        with executor_class(max_workers=threads) as executor:
            future_to_file = {executor.submit(self._scan_file_with_pause_check, file): file for file in files}

            for future in as_completed(future_to_file):
                # Check if stopped before processing this future
                if self.stopped:
                    logger.info("Scan stopped by user, cancelling remaining tasks...")
                    # Cancel all remaining futures
                    for f in future_to_file:
                        if not f.done():
                            f.cancel()
                    break

                # Wait if paused
                while self.paused:
                    time.sleep(0.1)
                    if self.stopped:
                        logger.info("Scan stopped while paused")
                        break

                file_path = future_to_file[future]
                try:
                    file_result = future.result()
                    if file_result and file_result["matches"]:
                        results["matches"].append(file_result)
                        logger.info(f"Found matches in: {file_path.name}")
                except Exception as e:
                    logger.error(f"Error scanning {file_path}: {e}")

                processed += 1

                # Report progress
                if show_progress and processed % 10 == 0:
                    logger.info(f"Progress: {processed}/{len(files)} files processed")

                # Call progress callback if set
                if self._progress_callback:
                    self._progress_callback(processed, len(files))

        results["scan_info"]["files_with_matches"] = len(results["matches"])
        results["scan_info"]["files_processed"] = processed
        results["stopped"] = self.stopped

        if self.stopped:
            logger.info(f"Scan stopped. Processed {processed} files")
        else:
            logger.info(f"Scan complete. Found matches in {len(results['matches'])} files")

        return results

    def _scan_file_with_pause_check(self, file_path: Path) -> dict[str, Any]:
        """Scan a single file with pause check."""
        # Check if stopped before processing
        if self.stopped:
            return {}

        return self._scan_file(file_path)

    def _collect_files(self) -> list[Path]:
        """Collect all files matching the specified types."""
        files = []
        for ext in self.file_types:
            files.extend(self.input_dir.rglob(f"*.{ext.lower()}"))
            files.extend(self.input_dir.rglob(f"*.{ext.upper()}"))
        return list(set(files))  # Remove duplicates

    def _scan_file(self, file_path: Path) -> dict[str, Any]:
        """Scan a single file and return matches."""
        file_start_time = time.perf_counter()
        ext = file_path.suffix.lower().lstrip(".")
        text = ""
        metadata = {}

        try:
            # Route to appropriate extractor
            if ext == "pdf":
                text, metadata = self._extract_pdf(file_path)
            elif ext == "odt":
                text, metadata = self._extract_odt(file_path)
            elif ext == "rtf":
                text, metadata = self._extract_rtf(file_path)
            elif ext == "epub":
                text, metadata = self._extract_epub(file_path)
            elif ext == "csv":
                text, metadata = self._extract_csv(file_path)
            elif ext == "xml":
                text, metadata = self._extract_xml(file_path)
            elif ext == "html" or ext == "htm":
                text, metadata = self._extract_html(file_path)
            elif ext == "md":
                text, metadata = self._extract_markdown(file_path)
            elif ext in ["docx", "doc"]:
                text, metadata = self._extract_docx(file_path)
            elif ext in ["xlsx", "xls"]:
                text, metadata = self._extract_xlsx(file_path)
            elif ext in ["pptx", "ppt"]:
                text, metadata = self._extract_pptx(file_path)
            elif ext in ["jpg", "jpeg", "png", "gif", "bmp", "tiff"]:
                text, metadata = self._extract_image(file_path)
            else:
                text, metadata = self._extract_text(file_path)
        except Exception as e:
            logger.warning(f"Could not extract text from {file_path}: {e}")
            return {}

        processing_time = time.perf_counter() - file_start_time

        if not text:
            return {}

        # Apply all rules
        file_matches = []
        for rule in self.rules:
            rule_matches = rule.search(text)
            if rule_matches:
                for match in rule_matches:
                    match["rule_name"] = rule.name
                    match["rule_description"] = rule.description
                file_matches.extend(rule_matches)

        if not file_matches:
            return {}

        # Add processing time to metadata
        metadata["processing_time_seconds"] = round(processing_time, 3)

        logger.info(f"Processed {file_path.name} ({ext}) in {processing_time:.3f}s - {len(file_matches)} matches found")

        return {
            "file_path": str(file_path),
            "file_type": ext,
            "file_size": file_path.stat().st_size,
            "metadata": metadata,
            "matches": file_matches,
        }

    def _extract_pdf(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from PDF file with fallback."""
        if fitz is not None:
            return self._extract_pdf_fitz(file_path)
        elif pypdf is not None:
            return self._extract_pdf_pypdf(file_path)
        else:
            logger.warning("No PDF library installed (pymupdf or pypdf)")
            return "", {}

    def _extract_pdf_fitz(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from PDF using PyMuPDF (fastest method)."""
        if not fitz:
            logger.warning("PyMuPDF not installed")
            return "", {}

        doc = fitz.open(str(file_path))
        if doc.page_count == 0:
            logger.warning(f"No pages found in {file_path}")
            return "", {}
        if not doc.metadata:
            logger.warning(f"No metadata found in {file_path}")
            return "", {}

        text_parts = []
        metadata = {
            "page_count": doc.page_count,
            "title": doc.metadata.get("title", ""),
            "author": doc.metadata.get("author", ""),
            "subject": doc.metadata.get("subject", ""),
            "creator": doc.metadata.get("creator", ""),
        }

        if self.use_pdf_ocr and pytesseract and Image:
            # OCR for image-based PDFs
            import io

            for page_num, page in enumerate(doc, 1):  # pyright: ignore[reportArgumentType]
                pix = page.get_pixmap()
                img_data = pix.tobytes("png")
                image = Image.open(io.BytesIO(img_data))
                text = pytesseract.image_to_string(image)
                text_parts.append(f"[Page {page_num}]\n{text}")
        else:
            # Extract text directly (faster)
            for page_num, page in enumerate(doc, 1):  # pyright: ignore[reportArgumentType]
                text = page.get_text()
                text_parts.append(f"[Page {page_num}]\n{text}")

        doc.close()
        return "\n\n".join(text_parts), metadata

    def _extract_pdf_pypdf(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from PDF using pypdf (fallback method)."""
        if not pypdf:
            logger.warning("pypdf not installed, skipping PDF extraction")
            return "", {}

        text_parts = []
        metadata = {}
        try:
            with open(file_path, "rb") as f:
                pdf_reader = pypdf.PdfReader(f)

                if not pdf_reader.metadata:
                    logger.warning(f"No metadata found in {file_path}")
                    return "", {}

                metadata = {
                    "page_count": len(pdf_reader.pages),
                    "title": pdf_reader.metadata.get("/Title", ""),
                    "author": pdf_reader.metadata.get("/Author", ""),
                }

                for page_num, page in enumerate(pdf_reader.pages, 1):
                    text = page.extract_text()
                    text_parts.append(f"[Page {page_num}]\n{text}")

        except Exception as e:
            logger.warning(f"Error extracting PDF with pypdf: {e}")
            return "", {}

        return "\n\n".join(text_parts), metadata

    def _extract_odt(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from ODT (OpenDocument Text) file."""
        if odf_odt is None:
            logger.warning("odfpy not installed, skipping ODT extraction")
            return "", {}

        try:
            doc = odf_odt.load(file_path)
            text = doc.textual_content  # pyright: ignore[reportAttributeAccessIssue]

            metadata = {
                "format": "ODT",
            }

            return text, metadata
        except Exception as e:
            logger.warning(f"Error extracting ODT: {e}")
            return "", {}

    def _extract_rtf(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from RTF (Rich Text Format) file."""
        try:
            with open(file_path, "rb") as f:
                content = f.read()

            # Simple RTF text extraction (removes control words)
            text = ""
            i = 0
            while i < len(content):
                if content[i] == ord("\\") and i + 1 < len(content):
                    if content[i + 1] in [ord("'"), ord("*"), ord("\\")]:
                        i += 2
                        continue
                    # Skip control words
                    while (
                        i < len(content)
                        and content[i] != ord(" ")
                        and content[i] != ord("{")
                        and content[i] != ord("}")
                    ):
                        i += 1
                elif content[i] >= 32 and content[i] <= 126:  # Printable ASCII
                    text += chr(content[i])
                    i += 1

            metadata = {
                "format": "RTF",
            }

            return text, metadata
        except Exception as e:
            logger.warning(f"Error extracting RTF: {e}")
            return "", {}

    def _extract_epub(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from EPUB (ebook) file."""
        if ebooklib is None:
            logger.warning("ebooklib not installed, skipping EPUB extraction")
            return "", {}

        try:
            book = epub.read_epub(file_path)
            text_parts = []

            # Extract text from all items
            for item in book.get_items():
                if item.get_type() == ebooklib.ITEM_DOCUMENT:  # pyright: ignore[reportAttributeAccessIssue]
                    # Remove HTML tags
                    html_content = item.get_content().decode("utf-8")  # pyright: ignore[reportAttributeAccessIssue]
                    import re

                    text = re.sub(r"<[^>]+>", " ", html_content)
                    text = html.unescape(text)
                    text_parts.append(text)

            metadata = {
                "title": book.get_metadata("DC", "title")[0][0] if book.get_metadata("DC", "title") else "",  # pyright: ignore[reportAttributeAccessIssue]
                "author": book.get_metadata("DC", "creator")[0][0] if book.get_metadata("DC", "creator") else "",  # pyright: ignore[reportAttributeAccessIssue]
                "format": "EPUB",
            }

            return "\n\n".join(text_parts), metadata
        except Exception as e:
            logger.warning(f"Error extracting EPUB: {e}")
            return "", {}

    def _extract_csv(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from CSV file."""
        try:
            text_parts = []
            with open(file_path, encoding="utf-8", errors="ignore") as f:
                reader = csv.reader(f)
                for row in reader:
                    row_text = " | ".join(str(cell) for cell in row)
                    text_parts.append(row_text)

            metadata = {
                "format": "CSV",
            }

            return "\n".join(text_parts), metadata
        except Exception as e:
            logger.warning(f"Error extracting CSV: {e}")
            return "", {}

    def _extract_xml(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from XML file."""
        try:
            tree = ET.parse(file_path)
            root = tree.getroot()

            # Extract all text content
            text_parts = [elem.text for elem in root.iter() if elem.text and elem.text.strip()]
            text = "\n".join(text_parts)

            metadata = {
                "format": "XML",
                "root_tag": root.tag,
            }

            return text, metadata
        except Exception as e:
            logger.warning(f"Error extracting XML: {e}")
            return "", {}

    def _extract_html(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from HTML file."""
        try:
            with open(file_path, encoding="utf-8", errors="ignore") as f:
                html_content = f.read()

            # Remove HTML tags
            import re

            text = re.sub(r"<[^>]+>", " ", html_content)
            text = html.unescape(text)
            text = re.sub(r"\s+", " ", text).strip()

            metadata = {
                "format": "HTML",
            }

            return text, metadata
        except Exception as e:
            logger.warning(f"Error extracting HTML: {e}")
            return "", {}

    def _extract_markdown(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from Markdown file."""
        try:
            with open(file_path, encoding="utf-8", errors="ignore") as f:
                content = f.read()

            if markdown:
                # Convert Markdown to HTML then extract text
                html_content = markdown.markdown(content)  # pyright: ignore[reportAttributeAccessIssue]
                import re

                text = re.sub(r"<[^>]+>", " ", html_content)
                text = html.unescape(text)
                text = re.sub(r"\s+", " ", text).strip()
            else:
                # Simple Markdown processing
                text = content

            metadata = {
                "format": "Markdown",
            }

            return text, metadata
        except Exception as e:
            logger.warning(f"Error extracting Markdown: {e}")
            return "", {}

    def _extract_docx(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from DOCX file."""
        if Document is None:
            logger.warning("python-docx not installed, skipping DOCX extraction")
            return "", {}

        doc = Document(str(file_path))
        text_parts = []

        for paragraph in doc.paragraphs:
            text_parts.append(paragraph.text)

        # Extract tables
        for table in doc.tables:
            for row in table.rows:
                row_text = " | ".join(cell.text for cell in row.cells)
                text_parts.append(row_text)

        metadata = {
            "paragraph_count": len(doc.paragraphs),
            "table_count": len(doc.tables),
        }

        return "\n".join(text_parts), metadata

    def _extract_xlsx(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from XLSX file."""
        if load_workbook is None:
            logger.warning("openpyxl not installed, skipping XLSX extraction")
            return "", {}

        wb = load_workbook(file_path, read_only=True, data_only=True)
        text_parts = []

        for sheet_name in wb.sheetnames:
            sheet = wb[sheet_name]
            text_parts.append(f"[Sheet: {sheet_name}]")
            for row in sheet.iter_rows(values_only=True):
                row_text = " | ".join(str(cell) if cell is not None else "" for cell in row)
                if row_text.strip():
                    text_parts.append(row_text)

        metadata = {
            "sheet_count": len(wb.sheetnames),
            "sheets": wb.sheetnames,
        }

        return "\n".join(text_parts), metadata

    def _extract_pptx(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from PPTX file."""
        try:
            from pptx import Presentation
        except ImportError:
            logger.warning("python-pptx not installed, skipping PPTX extraction")
            return "", {}

        prs = Presentation(str(file_path))
        text_parts = []

        for slide_num, slide in enumerate(prs.slides, 1):
            text_parts.append(f"[Slide {slide_num}]")
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    text_parts.append(shape.text)  # pyright: ignore[reportAttributeAccessIssue]

        metadata = {
            "slide_count": len(prs.slides),
        }

        return "\n".join(text_parts), metadata

    def _extract_image(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from image file using OCR."""
        if Image is None or pytesseract is None:
            logger.warning("PIL or pytesseract not installed, skipping image OCR")
            return "", {}

        try:
            img = Image.open(file_path)
            text = pytesseract.image_to_string(img)

            metadata = {
                "format": img.format,
                "mode": img.mode,
                "size": img.size,
            }

            return text, metadata
        except Exception as e:
            logger.warning(f"Could not perform OCR on {file_path}: {e}")
            return "", {}

    def _extract_text(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from plain text file."""
        encodings = ["utf-8", "latin-1", "cp1252", "utf-16"]

        for encoding in encodings:
            try:
                with open(file_path, encoding=encoding, errors="ignore") as f:
                    text = f.read()
                return text, {"encoding": encoding}
            except UnicodeDecodeError:
                continue

        return "", {}


def main():
    """Main entry point for document scanner."""
    parser = argparse.ArgumentParser(
        description="Scan documents and extract text, images, and metadata with high performance"
    )
    parser.add_argument("input", type=str, nargs="?", default=str(cwd), help="Input directory")
    parser.add_argument("-r", "--rules", type=str, default="rules.json", help="Rules file (JSON)")
    parser.add_argument("--recursive", action="store_true", help="Scan files recursively")
    parser.add_argument(
        "-f",
        "--file-types",
        help="File types to scan (comma-separated)",
        default="pdf,docx,xlsx,pptx,txt,odt,rtf,epub,csv,xml,html,md",
    )
    parser.add_argument("--use-pdf-ocr", help="Use PDF OCR for image-based PDFs", action="store_true")
    parser.add_argument(
        "--use-process-pool",
        help="Use process pool instead of thread pool (better for CPU-intensive tasks)",
        action="store_true",
    )
    parser.add_argument(
        "-b",
        "--batch-size",
        help="Number of files to process in each batch",
        default=50,
        type=int,
    )
    parser.add_argument("-t", "--threads", help="Number of threads for parallel scanning", default=4, type=int)
    parser.add_argument("--progress", help="Show progress bar", action="store_true")
    parser.add_argument("-v", "--verbose", help="Verbose output", action="store_true")
    args = parser.parse_args()

    if args.verbose:
        logger.setLevel(logging.DEBUG)

    t0 = time.perf_counter()
    # Validate input directory
    input_dir = Path(args.input)
    if not input_dir.exists() or not input_dir.is_dir():
        logger.error(f"Input directory does not exist: {args.input}")
        return
    logger.info(f"Scanning directory: {input_dir}...")

    # Load rules file
    rules_file = Path(args.rules)
    if not rules_file.exists() or not rules_file.is_file():
        rule_files_in_input_dir = list(input_dir.glob("rules*.json"))

        if rule_files_in_input_dir:
            rules_file = rule_files_in_input_dir[0]
        else:
            logger.error(f"Rules file does not exist: {args.rules}")
            return
    logger.info(f"Using rules file: {rules_file}")

    try:
        with open(rules_file, encoding="utf-8") as f:
            rules_data = json.load(f)
    except json.JSONDecodeError as e:
        logger.error(f"Invalid JSON in rules file: {e}")
        return

    # Parse rules
    rules = []
    if isinstance(rules_data, list):
        rules = [Rule(rule) for rule in rules_data]
    elif isinstance(rules_data, dict) and "rules" in rules_data:
        rules = [Rule(rule) for rule in rules_data["rules"]]
    else:
        logger.error("Invalid rules format. Expected a list or dict with 'rules' key")
        return

    if not rules:
        logger.error("No valid rules found")
        return

    # Parse file types
    file_types = [ft.strip() for ft in args.file_types.split(",")]

    # Create scanner and run scan
    scanner = DocumentScanner(input_dir, rules, file_types, args.use_pdf_ocr, args.use_process_pool, args.batch_size)
    results = scanner.scan(threads=args.threads, show_progress=args.progress)

    # Save results to JSON file in input directory
    output_file = input_dir / f"scan_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    logger.info(f"Results saved to: {output_file}")
    logger.info(f"Total time elapsed: {time.perf_counter() - t0:.2f}s")


if __name__ == "__main__":
    main()
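
For reference, main() accepts a rules file that is either a bare JSON list of rule objects or a dict with a top-level "rules" key, with the per-rule keys matching those read in Rule.__init__ (name, pattern, regex, case_sensitive, context_lines, description). The sketch below is illustrative only and not part of the wheel: the input directory, rule values, and regex are hypothetical, and it simply drives the Rule and DocumentScanner classes shown above from Python rather than through the CLI.

# Illustrative usage sketch (not shipped in the package); paths and rule values are made up.
from pathlib import Path

from sfi.docscan.docscan import DocumentScanner, Rule

# Rule dictionaries use the keys read by Rule.__init__ above.
rules = [
    Rule({
        "name": "email",
        "pattern": r"[\w.+-]+@[\w-]+\.[\w.-]+",
        "regex": True,
        "case_sensitive": False,
        "context_lines": 2,
        "description": "Email addresses",
    }),
    # Plain-text rule; unspecified keys fall back to the defaults in Rule.__init__.
    Rule({"name": "confidential", "pattern": "confidential"}),
]

scanner = DocumentScanner(
    input_dir=Path("./docs"),  # hypothetical input directory
    rules=rules,
    file_types=["pdf", "docx", "txt"],
    use_pdf_ocr=False,
)
results = scanner.scan(threads=4, show_progress=True)
print(results["scan_info"]["files_with_matches"], "files contained matches")

The equivalent CLI run would be along the lines of "python -m sfi.docscan.docscan ./docs -r rules.json -t 4 --progress", with the same rule objects stored as a JSON list in rules.json and the results written to a timestamped scan_results_*.json inside the scanned directory.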