pysfi 0.1.6__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
- {pysfi-0.1.6.dist-info → pysfi-0.1.10.dist-info}/METADATA +28 -3
- pysfi-0.1.10.dist-info/RECORD +39 -0
- {pysfi-0.1.6.dist-info → pysfi-0.1.10.dist-info}/entry_points.txt +7 -1
- sfi/__init__.py +1 -1
- sfi/bumpversion/__init__.py +1 -1
- sfi/cli.py +11 -0
- sfi/docscan/__init__.py +3 -0
- sfi/docscan/docscan.py +1145 -0
- sfi/docscan/docscan_gui.py +1282 -0
- sfi/docscan/lang/__init__.py +0 -0
- sfi/docscan/lang/eng.py +152 -0
- sfi/docscan/lang/zhcn.py +170 -0
- sfi/embedinstall/__init__.py +0 -0
- sfi/embedinstall/embedinstall.py +77 -17
- sfi/makepython/makepython.py +44 -27
- sfi/pdfsplit/__init__.py +0 -0
- sfi/pdfsplit/pdfsplit.py +173 -173
- sfi/projectparse/__init__.py +0 -0
- sfi/pylibpack/__init__.py +0 -0
- sfi/pylibpack/pylibpack.py +913 -0
- sfi/pyloadergen/pyloadergen.py +697 -111
- sfi/pypack/__init__.py +0 -0
- sfi/pypack/pypack.py +791 -0
- sfi/pysourcepack/pysourcepack.py +369 -0
- sfi/taskkill/__init__.py +0 -0
- sfi/which/__init__.py +0 -0
- sfi/workflowengine/__init__.py +0 -0
- sfi/workflowengine/workflowengine.py +444 -0
- pysfi-0.1.6.dist-info/RECORD +0 -21
- sfi/pypacker/fspacker.py +0 -91
- {pysfi-0.1.6.dist-info → pysfi-0.1.10.dist-info}/WHEEL +0 -0
sfi/docscan/docscan.py
ADDED
@@ -0,0 +1,1145 @@
"""Scan documents and extract text, images, and metadata with certain rules."""

from __future__ import annotations

import argparse
import contextlib
import csv
import html
import json
import logging
import re
import sys
import threading
import time
import xml.etree.ElementTree as ET
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path
from typing import Any, Callable

try:
    import fitz  # PyMuPDF
except ImportError:
    fitz = None

try:
    from docx import Document
except ImportError:
    Document = None

try:
    from openpyxl import load_workbook
except ImportError:
    load_workbook = None

try:
    from PIL import Image
except ImportError:
    Image = None

try:
    import pytesseract
except ImportError:
    pytesseract = None

try:
    import odf.opendocument as odf_odt  # ODT support
except ImportError:
    odf_odt = None

try:
    import ebooklib  # EPUB support
    from ebooklib import epub
except ImportError:
    ebooklib = None

try:
    import markdown  # Markdown to text
except ImportError:
    markdown = None

try:
    import pypdf  # Alternative PDF library
except ImportError:
    pypdf = None

# Language support imports
try:
    from sfi.docscan.lang.eng import ENGLISH_DEFAULTS as EN_TRANSLATIONS
    from sfi.docscan.lang.zhcn import TRANSLATIONS as ZH_TRANSLATIONS
except ImportError:
    try:
        from lang.eng import ENGLISH_DEFAULTS as EN_TRANSLATIONS
        from lang.zhcn import TRANSLATIONS as ZH_TRANSLATIONS
    except ImportError:
        # Fallback translations if import fails
        ZH_TRANSLATIONS = {}
        EN_TRANSLATIONS = {}

# Global language setting
USE_CHINESE = True  # Default to Chinese


def t(key: str, **kwargs) -> str:
    """Get translated text for the given key.

    Args:
        key: Translation key
        **kwargs: Arguments for string formatting

    Returns:
        Translated text
    """
    text = ZH_TRANSLATIONS.get(key, key) if USE_CHINESE else EN_TRANSLATIONS.get(key, key)

    # Format with kwargs if provided
    if kwargs:
        with contextlib.suppress(KeyError, ValueError):
            text = text.format(**kwargs)
    return text


logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)
cwd = Path.cwd()


class Rule:
    """Represents a scanning rule with optimized pattern matching."""

    def __init__(self, rule_data: dict[str, Any]):
        """Initialize rule from dictionary."""
        self.name = rule_data.get("name", "")
        self.pattern = rule_data.get("pattern", "")
        self.is_regex = rule_data.get("regex", False)
        self.case_sensitive = rule_data.get("case_sensitive", False)
        self.context_lines = rule_data.get("context_lines", 3)
        self.description = rule_data.get("description", "")

        if self.is_regex:
            flags = 0 if self.case_sensitive else re.IGNORECASE
            try:
                # Use re.ASCII for faster matching when possible
                self.compiled_pattern = re.compile(self.pattern, flags | re.ASCII)
            except re.error as e:
                logger.warning(t("invalid_regex_pattern", pattern=self.pattern, error=e))
                self.compiled_pattern = None
        else:
            self.compiled_pattern = None

    def search(self, text: str) -> list[dict[str, Any]]:
        """Search for pattern in text and return matches."""
        if not text or not self.pattern:
            return []

        matches = []
        lines = text.split("\n")

        if self.is_regex and self.compiled_pattern:
            # Regex search
            for line_num, line in enumerate(lines, 1):
                for match in self.compiled_pattern.finditer(line):
                    matches.append({
                        "type": "regex",
                        "line_number": line_num,
                        "match": match.group(),
                        "start": match.start(),
                        "end": match.end(),
                        "context": self._get_context(lines, line_num - 1),
                    })
        else:
            # Simple text search
            search_text = self.pattern if self.case_sensitive else self.pattern.lower()
            for line_num, line in enumerate(lines, 1):
                compare_line = line if self.case_sensitive else line.lower()
                start = 0
                while True:
                    pos = compare_line.find(search_text, start)
                    if pos == -1:
                        break
                    matches.append({
                        "type": "text",
                        "line_number": line_num,
                        "match": line[pos : pos + len(self.pattern)],
                        "start": pos,
                        "end": pos + len(self.pattern),
                        "context": self._get_context(lines, line_num - 1),
                    })
                    start = pos + 1

        return matches

    def _get_context(self, lines: list[str], line_index: int) -> list[str]:
        """Get context lines around a match."""
        start = max(0, line_index - self.context_lines)
        end = min(len(lines), line_index + self.context_lines + 1)
        return lines[start:end]


class DocumentScanner:
    """High-performance document scanner with multi-format support."""

    def __init__(
        self,
        input_dir: Path,
        rules: list[Rule],
        file_types: list[str],
        use_pdf_ocr: bool = False,
        use_process_pool: bool = False,
        batch_size: int = 50,
    ):
        """Initialize scanner with input directory and rules.

        Args:
            input_dir: Directory containing documents to scan
            rules: List of scanning rules
            file_types: List of file extensions to scan
            use_pdf_ocr: Use OCR for PDF files
            use_process_pool: Use process pool instead of thread pool for CPU-intensive tasks
            batch_size: Number of files to process in each batch
        """
        self.input_dir = Path(input_dir)
        self.rules = rules
        self.file_types = file_types
        self.use_pdf_ocr = use_pdf_ocr
        self.use_process_pool = use_process_pool
        self.batch_size = batch_size
        self.results = []
        self.paused = False
        self.paused_event = threading.Event()
        self.paused_event.set()  # Initially not paused
        self.stopped = False
        self._progress_callback = None
        self._executor = None  # Keep reference to executor for forced shutdown

    def set_progress_callback(self, callback: Callable[[int, int], None]) -> None:
        """Set callback function for progress updates.

        Args:
            callback: Function to call with progress (current, total)
        """
        self._progress_callback = callback

    def pause(self) -> None:
        """Pause the scanning process."""
        self.paused = True
        self.paused_event.clear()

    def resume(self) -> None:
        """Resume the scanning process."""
        self.paused = False
        self.paused_event.set()
        logger.info(t("scan_resumed"))

    def stop(self) -> None:
        """Stop the scanning process."""
        self.stopped = True
        self.paused_event.set()  # Ensure thread can exit
        logger.info(t("scan_stopped"))

    def is_paused(self) -> bool:
        """Check if the scanner is paused."""
        return self.paused

    def is_stopped(self) -> bool:
        """Check if the scanner is stopped."""
        return self.stopped

    def scan(self, threads: int = 4, show_progress: bool = False) -> dict[str, Any]:
        """Scan all documents in input directory.

        Args:
            threads: Number of worker threads/processes
            show_progress: Show progress bar

        Returns:
            Dictionary containing scan results
        """
        self.stopped = False
        self.paused = False
        self.paused_event.set()

        logger.info(t("scanning_directory", directory=str(self.input_dir)))
        files = self._collect_files()
        logger.info(t("found_files_to_scan", count=len(files)))

        results = {
            "scan_info": {
                "input_directory": str(self.input_dir),
                "scan_time": datetime.now().isoformat(),
                "file_types_scanned": self.file_types,
                "total_files": len(files),
                "rules_count": len(self.rules),
                "use_pdf_ocr": self.use_pdf_ocr,
                "use_process_pool": self.use_process_pool,
            },
            "rules": [{"name": r.name, "pattern": r.pattern, "is_regex": r.is_regex} for r in self.rules],
            "matches": [],
        }

        # Scan files in parallel
        processed = 0
        executor_class = ProcessPoolExecutor if self.use_process_pool else ThreadPoolExecutor
        executor = executor_class(max_workers=threads)
        self._executor = executor  # Keep reference for forced shutdown

        try:
            # Submit futures one by one to respect pause state
            submitted_futures = []
            was_paused = False  # Track previous pause state

            for file in files:
                # Check if stopped before submitting more tasks
                if self.stopped:
                    logger.info(t("scan_stopped_before_submitting_tasks"))
                    break

                # Wait if paused before submitting new tasks
                while self.paused:
                    # Log when entering paused state
                    if not was_paused:
                        logger.info(t("scan_paused"))
                        was_paused = True

                    self.paused_event.wait(0.1)
                    if self.stopped:
                        logger.info(t("scan_stopped_while_paused"))
                        break

                # Log when exiting paused state
                if was_paused and not self.paused:
                    logger.info(t("scan_resumed"))
                    was_paused = False

                if self.stopped:
                    break

                future = executor.submit(self._scan_file_with_pause_check, file)
                submitted_futures.append(future)

            # Process completed futures
            for future in as_completed(submitted_futures):
                # Check if stopped before processing this future
                if self.stopped:
                    logger.info(t("scan_stopped_by_user_canceling_tasks"))
                    # Cancel all remaining futures
                    for f in submitted_futures:
                        if not f.done():
                            f.cancel()
                    break

                # Wait if paused before processing result
                while self.paused:
                    # Log when entering paused state
                    if not was_paused:
                        logger.info(t("scan_paused"))
                        was_paused = True

                    self.paused_event.wait(0.1)
                    if self.stopped:
                        logger.info(t("scan_stopped_while_paused"))
                        break

                # Log when exiting paused state
                if was_paused and not self.paused:
                    logger.info(t("scan_resumed"))
                    was_paused = False

                if self.stopped:
                    break

                try:
                    file_result = future.result(timeout=1.0)  # Short timeout to allow quick stop
                    if file_result and file_result["matches"]:
                        results["matches"].append(file_result)
                        logger.info(t("found_matches_in_file", file_name=Path(file_result.get("file_path", "")).name))
                except TimeoutError:
                    logger.warning(t("task_timeout_scan_may_be_stopping"))
                    if self.stopped:
                        break
                except Exception as e:
                    if not self.stopped:
                        logger.error(t("error_scanning_file", error=e))

                processed += 1

                # Report progress
                if show_progress and processed % 10 == 0:
                    logger.info(t("progress_report", processed=processed, total=len(files)))

                # Call progress callback if set
                if self._progress_callback:
                    self._progress_callback(processed, len(files))

        finally:
            # Force shutdown if stopped
            if self.stopped and self._executor:
                logger.info(t("force_shutting_down_executor"))
                if sys.version_info >= (3, 9):
                    self._executor.shutdown(wait=False, cancel_futures=True)
                else:
                    self._executor.shutdown(wait=False)
            else:
                self._executor.shutdown(wait=True)
            self._executor = None

        results["scan_info"]["files_with_matches"] = len(results["matches"])
        results["scan_info"]["files_processed"] = processed
        results["stopped"] = self.stopped

        if self.stopped:
            logger.info(t("scan_stopped_processed_files", processed=processed))
        else:
            logger.info(t("scan_complete_found_matches", matches_count=len(results["matches"])))

        return results

    def _scan_file_with_pause_check(self, file_path: Path) -> dict[str, Any]:
        """Scan a single file with pause check."""
        # Check if stopped before processing
        if self.stopped:
            return {}

        # Check if paused before processing
        while self.paused:
            self.paused_event.wait(0.1)
            if self.stopped:
                return {}

        return self._scan_file(file_path)

    def _collect_files(self) -> list[Path]:
        """Collect all files matching the specified types."""
        files = []
        image_extensions = ["jpg", "jpeg", "png", "gif", "bmp", "tiff"]

        for ext in self.file_types:
            # If extension is an image format and OCR is not enabled, skip
            if ext.lower() in image_extensions and not self.use_pdf_ocr:
                continue
            files.extend(self.input_dir.rglob(f"*.{ext.lower()}"))
            files.extend(self.input_dir.rglob(f"*.{ext.upper()}"))
        return list(set(files))  # Remove duplicates

    def _scan_file(self, file_path: Path) -> dict[str, Any]:
        """Scan a single file and return matches."""
        # Check if stopped before starting
        if self.stopped:
            return {}

        # Check if paused before starting
        while self.paused:
            self.paused_event.wait(0.1)
            if self.stopped:
                return {}

        file_start_time = time.perf_counter()
        ext = file_path.suffix.lower().lstrip(".")
        text = ""
        metadata = {}

        try:
            # Check if stopped before extraction
            if self.stopped:
                return {}

            # Check if paused before extraction
            while self.paused:
                self.paused_event.wait(0.1)
                if self.stopped:
                    return {}

            # Route to appropriate extractor
            if ext == "pdf":
                text, metadata = self._extract_pdf(file_path)
            elif ext == "odt":
                text, metadata = self._extract_odt(file_path)
            elif ext == "rtf":
                text, metadata = self._extract_rtf(file_path)
            elif ext == "epub":
                text, metadata = self._extract_epub(file_path)
            elif ext == "csv":
                text, metadata = self._extract_csv(file_path)
            elif ext == "xml":
                text, metadata = self._extract_xml(file_path)
            elif ext == "html" or ext == "htm":
                text, metadata = self._extract_html(file_path)
            elif ext == "md":
                text, metadata = self._extract_markdown(file_path)
            elif ext in ["docx", "doc"]:
                text, metadata = self._extract_docx(file_path)
            elif ext in ["xlsx", "xls"]:
                text, metadata = self._extract_xlsx(file_path)
            elif ext in ["pptx", "ppt"]:
                text, metadata = self._extract_pptx(file_path)
            elif ext in ["jpg", "jpeg", "png", "gif", "bmp", "tiff"]:
                # Only extract images if OCR is enabled
                if self.use_pdf_ocr:  # Using the same flag for consistency
                    text, metadata = self._extract_image(file_path)
                else:
                    return {}  # Skip image files if OCR is disabled
            else:
                text, metadata = self._extract_text(file_path)

            # Check if stopped after extraction
            if self.stopped:
                return {}

            # Check if paused after extraction
            while self.paused:
                self.paused_event.wait(0.1)
                if self.stopped:
                    return {}

        except Exception as e:
            logger.warning(t("could_not_extract_text_from_file", file_path=file_path, error=e))
            return {}

        processing_time = time.perf_counter() - file_start_time

        if not text:
            return {}

        # Apply all rules with stop check
        file_matches = []
        for rule in self.rules:
            if self.stopped:
                return {}
            # Check if paused before each rule
            while self.paused:
                self.paused_event.wait(0.1)
                if self.stopped:
                    return {}
            rule_matches = rule.search(text)
            if rule_matches:
                for match in rule_matches:
                    match["rule_name"] = rule.name
                    match["rule_description"] = rule.description
                file_matches.extend(rule_matches)

        if not file_matches:
            return {}

        # Add processing time to metadata
        metadata["processing_time_seconds"] = round(processing_time, 3)

        logger.info(
            t(
                "processed_file_info",
                file_name=file_path.name,
                ext=ext,
                time=round(processing_time, 3),
                matches_count=len(file_matches),
            )
        )

        return {
            "file_path": str(file_path),
            "file_type": ext,
            "file_size": file_path.stat().st_size,
            "metadata": metadata,
            "matches": file_matches,
        }

    def _extract_pdf(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from PDF file with fallback."""
        # Try PyMuPDF first (faster)
        if fitz is not None:
            try:
                return self._extract_pdf_fitz(file_path)
            except Exception as e:
                logger.warning(t("pymupdf_failed_for_file", file_name=file_path.name, error=e))

        # Fallback to pypdf
        if pypdf is not None:
            try:
                return self._extract_pdf_pypdf(file_path)
            except Exception as e:
                logger.error(t("pypdf_also_failed_for_file", file_name=file_path.name, error=e))
                return "", {}

        logger.warning(t("no_pdf_library_installed"))
        return "", {}

    def _extract_pdf_fitz(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from PDF using PyMuPDF (fastest method)."""
        if not fitz:
            logger.warning(t("pymupdf_not_installed"))
            return "", {}

        doc = None
        try:
            doc = fitz.open(str(file_path))
            if doc.page_count == 0:
                logger.warning(t("no_pages_found_in_file", file_path=file_path))
                return "", {}
            if not doc.metadata:
                logger.warning(t("no_metadata_found_in_file", file_path=file_path))
                return "", {}

            text_parts = []
            metadata = {
                "page_count": doc.page_count,
                "title": doc.metadata.get("title", ""),
                "author": doc.metadata.get("author", ""),
                "subject": doc.metadata.get("subject", ""),
                "creator": doc.metadata.get("creator", ""),
            }

            if self.use_pdf_ocr and pytesseract and Image:
                # OCR for image-based PDFs
                import io

                for page_num, page in enumerate(doc, 1):  # pyright: ignore[reportArgumentType]
                    # Check if stopped before processing each page
                    if self.stopped:
                        doc.close()
                        return "", {}

                    # Check if paused before processing each page
                    while self.paused:
                        self.paused_event.wait(0.1)
                        if self.stopped:
                            doc.close()
                            return "", {}

                    pix = page.get_pixmap()
                    img_data = pix.tobytes("png")
                    image = Image.open(io.BytesIO(img_data))
                    text = pytesseract.image_to_string(image)
                    text_parts.append(f"[Page {page_num}]\n{text}")
            else:
                # Extract text directly (faster)
                for page_num, page in enumerate(doc, 1):  # pyright: ignore[reportArgumentType]
                    # Check if stopped before processing each page
                    if self.stopped:
                        doc.close()
                        return "", {}

                    # Check if paused before processing each page
                    while self.paused:
                        self.paused_event.wait(0.1)
                        if self.stopped:
                            doc.close()
                            return "", {}

                    text = page.get_text()
                    text_parts.append(f"[Page {page_num}]\n{text}")

            doc.close()
            return "\n\n".join(text_parts), metadata
        except Exception as e:
            if doc:
                doc.close()
            logger.warning(t("pymupdf_error_trying_fallback", file_path=file_path, error=e))
            # Re-raise to trigger fallback to pypdf
            raise

    def _extract_pdf_pypdf(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from PDF using pypdf (fallback method)."""
        if not pypdf:
            logger.warning(t("pypdf_not_installed_skipping_extraction"))
            return "", {}

        text_parts = []
        metadata = {}
        try:
            with open(file_path, "rb") as f:
                pdf_reader = pypdf.PdfReader(f)

                if not pdf_reader.metadata:
                    logger.warning(t("no_metadata_found_in_file", file_path=file_path))
                    return "", {}

                metadata = {
                    "page_count": len(pdf_reader.pages),
                    "title": pdf_reader.metadata.get("/Title", ""),
                    "author": pdf_reader.metadata.get("/Author", ""),
                }

                for page_num, page in enumerate(pdf_reader.pages, 1):
                    # Check if stopped before processing each page
                    if self.stopped:
                        return "", {}

                    # Check if paused before processing each page
                    while self.paused:
                        self.paused_event.wait(0.1)
                        if self.stopped:
                            return "", {}

                    text = page.extract_text()
                    text_parts.append(f"[Page {page_num}]\n{text}")

        except Exception as e:
            logger.warning(t("error_extracting_pdf_with_pypdf", error=e))
            return "", {}

        return "\n\n".join(text_parts), metadata

    def _extract_odt(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from ODT (OpenDocument Text) file."""
        if odf_odt is None:
            logger.warning(t("odfpy_not_installed_skipping_extraction"))
            return "", {}

        try:
            doc = odf_odt.load(file_path)
            text = doc.textual_content  # pyright: ignore[reportAttributeAccessIssue]

            metadata = {
                "format": "ODT",
            }

            return text, metadata
        except Exception as e:
            logger.warning(t("error_extracting_odt", error=e))
            return "", {}

    def _extract_rtf(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from RTF (Rich Text Format) file."""
        try:
            with open(file_path, "rb") as f:
                content = f.read()

            # Simple RTF text extraction (removes control words)
            text = ""
            i = 0
            while i < len(content):
                if content[i] == ord("\\") and i + 1 < len(content):
                    if content[i + 1] in [ord("'"), ord("*"), ord("\\")]:
                        i += 2
                        continue
                    # Skip control words
                    while (
                        i < len(content)
                        and content[i] != ord(" ")
                        and content[i] != ord("{")
                        and content[i] != ord("}")
                    ):
                        i += 1
                elif content[i] >= 32 and content[i] <= 126:  # Printable ASCII
                    text += chr(content[i])
                    i += 1

            metadata = {
                "format": "RTF",
            }

            return text, metadata
        except Exception as e:
            logger.warning(t("error_extracting_rtf", error=e))
            return "", {}

    def _extract_epub(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from EPUB (ebook) file."""
        if ebooklib is None:
            logger.warning(t("ebooklib_not_installed_skipping_extraction"))
            return "", {}

        try:
            book = epub.read_epub(file_path)
            text_parts = []

            # Extract text from all items
            for item in book.get_items():
                # Check if stopped before processing each item
                if self.stopped:
                    return "", {}

                # Check if paused before processing each item
                while self.paused:
                    self.paused_event.wait(0.1)
                    if self.stopped:
                        return "", {}

                if item.get_type() == ebooklib.ITEM_DOCUMENT:  # pyright: ignore[reportAttributeAccessIssue]
                    # Remove HTML tags
                    html_content = item.get_content().decode("utf-8")  # pyright: ignore[reportAttributeAccessIssue]
                    import re

                    text = re.sub(r"<[^>]+>", " ", html_content)
                    text = html.unescape(text)
                    text_parts.append(text)

            metadata = {
                "title": book.get_metadata("DC", "title")[0][0] if book.get_metadata("DC", "title") else "",  # pyright: ignore[reportAttributeAccessIssue]
                "author": book.get_metadata("DC", "creator")[0][0] if book.get_metadata("DC", "creator") else "",  # pyright: ignore[reportAttributeAccessIssue]
                "format": "EPUB",
            }

            return "\n\n".join(text_parts), metadata
        except Exception as e:
            logger.warning(t("error_extracting_epub", error=e))
            return "", {}

    def _extract_csv(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from CSV file."""
        try:
            text_parts = []
            with open(file_path, encoding="utf-8", errors="ignore") as f:
                reader = csv.reader(f)
                for row in reader:
                    # Check if stopped periodically during row processing
                    if self.stopped:
                        return "", {}

                    # Check if paused periodically during row processing
                    while self.paused:
                        self.paused_event.wait(0.1)
                        if self.stopped:
                            return "", {}

                    row_text = " | ".join(str(cell) for cell in row)
                    text_parts.append(row_text)

            metadata = {
                "format": "CSV",
            }

            return "\n".join(text_parts), metadata
        except Exception as e:
            logger.warning(t("error_extracting_csv", error=e))
            return "", {}

    def _extract_xml(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from XML file."""
        try:
            tree = ET.parse(file_path)
            root = tree.getroot()

            # Extract all text content
            text_parts = [elem.text for elem in root.iter() if elem.text and elem.text.strip()]
            text = "\n".join(text_parts)

            metadata = {
                "format": "XML",
                "root_tag": root.tag,
            }

            return text, metadata
        except Exception as e:
            logger.warning(t("error_extracting_xml", error=e))
            return "", {}

    def _extract_html(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from HTML file."""
        try:
            with open(file_path, encoding="utf-8", errors="ignore") as f:
                html_content = f.read()

            # Remove HTML tags
            import re

            text = re.sub(r"<[^>]+>", " ", html_content)
            text = html.unescape(text)
            text = re.sub(r"\s+", " ", text).strip()

            metadata = {
                "format": "HTML",
            }

            return text, metadata
        except Exception as e:
            logger.warning(t("error_extracting_html", error=e))
            return "", {}

    def _extract_markdown(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from Markdown file."""
        try:
            with open(file_path, encoding="utf-8", errors="ignore") as f:
                content = f.read()

            if markdown:
                # Convert Markdown to HTML then extract text
                html_content = markdown.markdown(content)  # pyright: ignore[reportAttributeAccessIssue]
                import re

                text = re.sub(r"<[^>]+>", " ", html_content)
                text = html.unescape(text)
                text = re.sub(r"\s+", " ", text).strip()
            else:
                # Simple Markdown processing
                text = content

            metadata = {
                "format": "Markdown",
            }

            return text, metadata
        except Exception as e:
            logger.warning(t("error_extracting_markdown", error=e))
            return "", {}

    def _extract_docx(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from DOCX file."""
        if Document is None:
            logger.warning(t("python_docx_not_installed_skipping_extraction"))
            return "", {}

        doc = Document(str(file_path))
        text_parts = []

        for paragraph in doc.paragraphs:
            # Check if stopped periodically during paragraph processing
            if self.stopped:
                return "", {}

            # Check if paused periodically during paragraph processing
            while self.paused:
                self.paused_event.wait(0.1)
                if self.stopped:
                    return "", {}

            text_parts.append(paragraph.text)

        # Extract tables
        for table in doc.tables:
            # Check if stopped before processing each table
            if self.stopped:
                return "", {}

            # Check if paused before processing each table
            while self.paused:
                self.paused_event.wait(0.1)
                if self.stopped:
                    return "", {}

            for row in table.rows:
                row_text = " | ".join(cell.text for cell in row.cells)
                text_parts.append(row_text)

        metadata = {
            "paragraph_count": len(doc.paragraphs),
            "table_count": len(doc.tables),
        }

        return "\n".join(text_parts), metadata

    def _extract_xlsx(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from XLSX file."""
        if load_workbook is None:
            logger.warning(t("openpyxl_not_installed_skipping_extraction"))
            return "", {}

        wb = load_workbook(file_path, read_only=True, data_only=True)
        text_parts = []

        for sheet_name in wb.sheetnames:
            # Check if stopped before processing each sheet
            if self.stopped:
                wb.close()
                return "", {}

            # Check if paused before processing each sheet
            while self.paused:
                self.paused_event.wait(0.1)
                if self.stopped:
                    wb.close()
                    return "", {}

            sheet = wb[sheet_name]
            text_parts.append(f"[Sheet: {sheet_name}]")
            for row in sheet.iter_rows(values_only=True):
                # Check if stopped periodically during row processing
                if self.stopped:
                    wb.close()
                    return "", {}

                # Check if paused periodically during row processing
                while self.paused:
                    self.paused_event.wait(0.1)
                    if self.stopped:
                        wb.close()
                        return "", {}

                row_text = " | ".join(str(cell) if cell is not None else "" for cell in row)
                if row_text.strip():
                    text_parts.append(row_text)

        metadata = {
            "sheet_count": len(wb.sheetnames),
            "sheets": wb.sheetnames,
        }

        wb.close()
        return "\n".join(text_parts), metadata

    def _extract_pptx(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from PPTX file."""
        try:
            from pptx import Presentation
        except ImportError:
            logger.warning(t("python_pptx_not_installed_skipping_extraction"))
            return "", {}

        prs = Presentation(str(file_path))
        text_parts = []

        for slide_num, slide in enumerate(prs.slides, 1):
            # Check if stopped before processing each slide
            if self.stopped:
                return "", {}

            # Check if paused before processing each slide
            while self.paused:
                self.paused_event.wait(0.1)
                if self.stopped:
                    return "", {}

            text_parts.append(f"[Slide {slide_num}]")
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    text_parts.append(shape.text)  # pyright: ignore[reportAttributeAccessIssue]

        metadata = {
            "slide_count": len(prs.slides),
        }

        return "\n".join(text_parts), metadata

    def _extract_image(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from image file using OCR."""
        if Image is None or pytesseract is None:
            logger.warning(t("pillow_or_tesseract_not_installed_skipping_ocr"))
            return "", {}

        try:
            img = Image.open(file_path)
            text = pytesseract.image_to_string(img)

            metadata = {
                "format": img.format,
                "mode": img.mode,
                "size": img.size,
            }

            return text, metadata
        except Exception as e:
            logger.warning(t("could_not_perform_ocr_on_file", file_path=file_path, error=e))
            return "", {}

    def _extract_text(self, file_path: Path) -> tuple[str, dict[str, Any]]:
        """Extract text from plain text file."""
        encodings = ["utf-8", "latin-1", "cp1252", "utf-16"]

        for encoding in encodings:
            try:
                with open(file_path, encoding=encoding, errors="ignore") as f:
                    text = f.read()
                    return text, {"encoding": encoding}
            except UnicodeDecodeError:
                continue

        return "", {}


def main():
    """Main entry point for document scanner."""
    # Parse the language argument first, without using translations yet
    temp_parser = argparse.ArgumentParser(add_help=False)
    temp_parser.add_argument("--lang", choices=["en", "zh"], default="zh")
    temp_args, _ = temp_parser.parse_known_args()

    # Set the language
    global USE_CHINESE
    USE_CHINESE = temp_args.lang == "zh"

    parser = argparse.ArgumentParser(description=t("document_scanner_description"))
    parser.add_argument("input", type=str, nargs="?", default=str(cwd), help=t("input_directory_help"))
    parser.add_argument("-r", "--rules", type=str, default="rules.json", help=t("rules_file_help"))
    parser.add_argument("--recursive", action="store_true", help=t("recursive_help"))
    parser.add_argument(
        "-f",
        "--file-types",
        help=t("file_types_help"),
        default="pdf,docx,xlsx,pptx,txt,odt,rtf,epub,csv,xml,html,md,jpg,jpeg,png,gif,bmp,tiff",
    )
    parser.add_argument("--use-pdf-ocr", help=t("use_pdf_ocr_help"), action="store_true")
    parser.add_argument(
        "--use-process-pool",
        help=t("use_process_pool_help"),
        action="store_true",
    )
    parser.add_argument(
        "-b",
        "--batch-size",
        help=t("batch_size_help"),
        default=50,
        type=int,
    )
    parser.add_argument("-t", "--threads", help=t("threads_help"), default=4, type=int)
    parser.add_argument("--progress", help=t("progress_help"), action="store_true")
    parser.add_argument("-v", "--verbose", help=t("verbose_help"), action="store_true")

    # Add the language argument
    parser.add_argument("--lang", help=t("language_help"), choices=["en", "zh"], default="zh")

    args = parser.parse_args()

    # Confirm the language setting again (in case the user changed it in the full argument list)
    USE_CHINESE = args.lang == "zh"

    if args.verbose:
        logger.setLevel(logging.DEBUG)

    t0 = time.perf_counter()
    # Validate input directory
    input_dir = Path(args.input)
    if not input_dir.exists() or not input_dir.is_dir():
        logger.error(t("input_directory_does_not_exist", input_dir=args.input))
        return
    logger.info(t("scanning_directory", directory=str(input_dir)))

    # Load rules file
    rules_file = Path(args.rules)
    if not rules_file.exists() or not rules_file.is_file():
        rule_files_in_input_dir = list(input_dir.glob("rules*.json"))

        if rule_files_in_input_dir:
            rules_file = rule_files_in_input_dir[0]
        else:
            logger.error(t("rules_file_does_not_exist_alt", rules_file=args.rules))
            return
    logger.info(t("using_rules_file", rules_file=str(rules_file)))

    try:
        with open(rules_file, encoding="utf-8") as f:
            rules_data = json.load(f)
    except json.JSONDecodeError as e:
        logger.error(t("invalid_json_in_rules_file", error=e))
        return

    # Parse rules
    rules = []
    if isinstance(rules_data, list):
        rules = [Rule(rule) for rule in rules_data]
    elif isinstance(rules_data, dict) and "rules" in rules_data:
        rules = [Rule(rule) for rule in rules_data["rules"]]
    else:
        logger.error(t("invalid_rules_format"))
        return

    if not rules:
        logger.error(t("no_valid_rules_found"))
        return

    # Parse file types
    file_types = [ft.strip() for ft in args.file_types.split(",")]

    # Create scanner and run scan
    scanner = DocumentScanner(input_dir, rules, file_types, args.use_pdf_ocr, args.use_process_pool, args.batch_size)
    results = scanner.scan(threads=args.threads, show_progress=args.progress)

    # Save results to JSON file in input directory
    output_file = input_dir / f"scan_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    logger.info(t("results_saved_to", path=str(output_file)))
    logger.info(t("total_time_elapsed", time=round(time.perf_counter() - t0, 2)))


if __name__ == "__main__":
    main()
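For reference, a minimal usage sketch of the API added above (not part of the package): the rule keys and constructor arguments are taken from Rule.__init__ and DocumentScanner.__init__ as they appear in this diff, the ./docs directory and the example patterns are hypothetical, and the optional extractor libraries (PyMuPDF, python-docx, openpyxl, etc.) only need to be installed for the formats actually scanned.

# Hedged sketch: drive the scanner programmatically instead of via main().
from pathlib import Path

from sfi.docscan.docscan import DocumentScanner, Rule

rules = [
    Rule({
        "name": "email",
        "pattern": r"[\w.+-]+@[\w-]+\.[\w.]+",  # hypothetical example pattern
        "regex": True,             # compile as a regular expression
        "case_sensitive": False,
        "context_lines": 2,        # lines of context kept around each match
        "description": "E-mail addresses",
    }),
    Rule({"name": "keyword", "pattern": "confidential"}),  # plain substring rule
]

scanner = DocumentScanner(
    input_dir=Path("./docs"),            # hypothetical input directory
    rules=rules,
    file_types=["pdf", "docx", "txt"],
    use_pdf_ocr=False,
)
scanner.set_progress_callback(lambda done, total: print(f"{done}/{total} files"))
results = scanner.scan(threads=4, show_progress=True)
print(results["scan_info"]["files_with_matches"])

The same rule dictionaries can be stored in a rules.json file, either as a bare list or as an object with a "rules" key, which is what main() accepts; with the package installed, a command-line run would presumably look something like `python -m sfi.docscan.docscan ./docs -r rules.json -f pdf,docx,txt -t 4 --progress --lang en`.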