Semapp 1.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,513 @@
1
+ """
2
+ Module for detecting numbers in TIFF images using OCR (Tesseract).
3
+ """
4
+
5
+ import cv2
6
+ import pytesseract
7
+ from PIL import Image
8
+ import os
9
+ import numpy as np
10
+ import pandas as pd
11
+ import time
12
+ from multiprocessing import Pool, cpu_count
13
+ from functools import partial
14
+
15
# Configuration: number of worker processes used for multiprocessing.
# - None     -> use all available cores (cpu_count()).
# - integer  -> use that many workers (e.g. 4 for 4 cores).
# In both cases the pool is additionally capped by the number of pages
# being processed, so a small TIFF never spawns idle workers.
N_CPU = 8  # None = use all cores, or set to specific number like 4, 8, etc.
18
+
19
+
20
class Detection:
    """
    Detect numbers in TIFF images using OCR (Tesseract).

    A fixed region of interest (ROI) is cropped from each page, binarized
    and passed to Tesseract restricted to the characters 0-9. Results can be
    collected per file or per directory and exported to CSV.
    """

    # ROI used when the caller does not supply one: (x, y, width, height).
    DEFAULT_ROI = (1100, 0, 250, 35)

    # Column order of the CSV produced by save_results_to_csv.
    CSV_COLUMNS = ('File', 'Path', 'Page', 'Detected_Number')

    # Tesseract options: default engine, page-segmentation mode 8
    # (single word), digits only.
    TESSERACT_CONFIG = r'--oem 3 --psm 8 -c tessedit_char_whitelist=0123456789'

    def __init__(self, dirname, roi=None):
        """
        Initialize the detection instance with necessary parameters.

        Args:
            dirname (str): The base directory for the files.
            roi (tuple): Optional tuple (x, y, w, h) defining the region of interest.
                Default is (1100, 0, 250, 35).

        Raises:
            RuntimeError: On Windows, if Tesseract cannot be located.
        """
        self.dirname = dirname
        self.roi = roi if roi is not None else self.DEFAULT_ROI
        self._setup_tesseract()

    def _setup_tesseract(self):
        """
        Configure Tesseract OCR path automatically on Windows.

        If Tesseract is not already callable, searches common installation
        paths and sets pytesseract.pytesseract.tesseract_cmd to the first one
        that exists. On other platforms nothing is configured.

        Returns:
            bool: True if Tesseract is configured successfully (always True
            on non-Windows platforms, where no check is performed).

        Raises:
            RuntimeError: On Windows, if Tesseract is not found.
        """
        import sys

        if sys.platform != 'win32':
            return True

        # Check if Tesseract is already callable (e.g. it is on PATH).
        try:
            pytesseract.get_tesseract_version()
            return True
        except (Exception, SystemExit):
            # Deliberate best-effort probe: fall through to the path search.
            # NOTE(review): SystemExit is caught because some pytesseract
            # releases appear to raise it for an unparsable version string -
            # confirm. KeyboardInterrupt is intentionally NOT swallowed.
            pass

        # Common paths where Tesseract might be installed.
        possible_paths = [
            r"C:\Program Files\Tesseract-OCR\tesseract.exe",
            r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
            r"C:\Users\{}\AppData\Local\Programs\Tesseract-OCR\tesseract.exe".format(os.getenv('USERNAME', '')),
        ]
        for path in possible_paths:
            if os.path.exists(path):
                pytesseract.pytesseract.tesseract_cmd = path
                return True

        raise RuntimeError(
            "Tesseract not found. Please install Tesseract OCR:\n"
            "1. Download from: https://github.com/UB-Mannheim/tesseract/wiki\n"
            "2. Install in default path: C:\\Program Files\\Tesseract-OCR\\"
        )

    @staticmethod
    def _draw_detections(display_img, data, offset, scale):
        """
        Draw a green box and label for every digit-only OCR hit.

        Args:
            display_img: Full-size image that is annotated in place.
            data: Dictionary returned by pytesseract.image_to_data
                (keys 'text', 'conf', 'left', 'top', 'width', 'height').
            offset: (x, y) position of the OCR input inside display_img.
            scale: (scale_x, scale_y) factors mapping OCR coordinates back to
                full-resolution pixels.
        """
        x_off, y_off = offset
        scale_x, scale_y = scale
        boxes = zip(data['text'], data['conf'], data['left'],
                    data['top'], data['width'], data['height'])

        for raw_text, raw_conf, x, y, w, h in boxes:
            # The confidence may arrive as int, float or a (possibly empty or
            # float-formatted) string; go through float() so "96.4" does not
            # raise, and treat anything unparsable as "no confidence".
            try:
                conf = int(float(raw_conf))
            except (TypeError, ValueError):
                conf = -1

            if conf <= 0 or w <= 0 or h <= 0:
                continue

            detected_text = raw_text.strip()
            if not detected_text.isdigit():
                continue

            # Map ROI-local (and possibly downscaled) coordinates back onto
            # the full image.
            x_abs = int(x * scale_x) + x_off
            y_abs = int(y * scale_y) + y_off
            w_scaled = int(w * scale_x)
            h_scaled = int(h * scale_y)

            cv2.rectangle(display_img, (x_abs, y_abs),
                          (x_abs + w_scaled, y_abs + h_scaled), (0, 255, 0), 3)
            cv2.putText(display_img, detected_text, (x_abs, max(y_abs - 5, 10)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

    def detect_number_on_image(self, img_array, page_num=None, show_detection=False, resize_factor=1.0):
        """
        Detect numbers on an image (numpy array).

        Args:
            img_array: Image as numpy array (2-D grayscale or 3-D color).
            page_num: Page number (optional; accepted for API compatibility,
                not used by the detection itself).
            show_detection: If True, also returns annotated image with detected zones
            resize_factor: Factor to resize image before OCR (1.0 = no resize,
                0.5 = half size for speed). Values outside (0, 1) disable resizing.

        Returns:
            If show_detection=False: Detected numbers (string of digits) or None
            If show_detection=True: Tuple (detected_numbers, annotated_image)

        Raises:
            RuntimeError: If Tesseract is not available.
        """
        try:
            # Verify Tesseract is available
            pytesseract.get_tesseract_version()
        except Exception as e:
            raise RuntimeError(f"Tesseract not available: {e}") from e

        # Extract ROI if specified, clamped to the image bounds.
        if self.roi is not None:
            x_roi, y_roi, w_roi, h_roi = self.roi
            height, width = img_array.shape[:2]
            x_roi = max(0, min(x_roi, width - 1))
            y_roi = max(0, min(y_roi, height - 1))
            w_roi = min(w_roi, width - x_roi)
            h_roi = min(h_roi, height - y_roi)
            roi_img = img_array[y_roi:y_roi + h_roi, x_roi:x_roi + w_roi]
        else:
            roi_img = img_array
            x_roi, y_roi = 0, 0

        # Optional downscale for faster OCR.
        scale_x = scale_y = 1.0
        if 0 < resize_factor < 1.0:
            src_h, src_w = roi_img.shape[:2]
            # Never request a 0-pixel dimension from cv2.resize.
            new_w = max(1, int(src_w * resize_factor))
            new_h = max(1, int(src_h * resize_factor))
            roi_img = cv2.resize(roi_img, (new_w, new_h), interpolation=cv2.INTER_AREA)
            # Use the real ratio (after integer truncation) so boxes drawn on
            # the full image line up with the OCR coordinates.
            scale_x = src_w / new_w
            scale_y = src_h / new_h

        # Convert to grayscale if necessary; keep a color copy for display.
        # NOTE(review): pages loaded through PIL are presumably RGB while the
        # conversion below assumes BGR - confirm whether this matters here.
        if len(roi_img.shape) == 3:
            gray = cv2.cvtColor(roi_img, cv2.COLOR_BGR2GRAY)
            display_img = img_array.copy() if show_detection else None
        else:
            gray = roi_img
            display_img = cv2.cvtColor(img_array, cv2.COLOR_GRAY2BGR) if show_detection else None

        # Light denoising, then Otsu binarization to improve text contrast.
        gray = cv2.medianBlur(gray, 3)
        _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        # Convert to PIL Image for pytesseract
        pil_image = Image.fromarray(thresh)
        custom_config = self.TESSERACT_CONFIG

        # The returned value always comes from image_to_string.
        text = pytesseract.image_to_string(pil_image, config=custom_config)
        digits_only = "".join(ch for ch in text if ch.isdigit())

        if show_detection:
            # Bounding boxes are only needed for the annotated image, so the
            # second (expensive) OCR pass is skipped otherwise.
            try:
                data = pytesseract.image_to_data(
                    pil_image, config=custom_config, output_type=pytesseract.Output.DICT
                )
                self._draw_detections(display_img, data, (x_roi, y_roi), (scale_x, scale_y))
            except Exception:
                # Annotation is best-effort: the OCR result above stays valid
                # even if the bounding-box pass fails.
                pass

            # Draw ROI rectangle for visualization
            if self.roi is not None:
                cv2.rectangle(display_img, (x_roi, y_roi),
                              (x_roi + w_roi, y_roi + h_roi), (255, 255, 0), 2)
                # Put the label above the ROI when there is room, otherwise
                # below it (an ROI touching the top edge would hide it).
                label_y = y_roi - 10 if y_roi - 10 >= 10 else y_roi + h_roi + 20
                cv2.putText(display_img, "ROI", (x_roi, label_y),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 0), 2)

        detected = digits_only or None
        if show_detection:
            return detected, display_img
        return detected

    @staticmethod
    def _process_single_page_static(args):
        """
        Static helper method to process a single page (for multiprocessing).

        Any exception raised while processing the page is swallowed and
        reported as "nothing detected" so one bad page does not abort the
        whole pool.

        Args:
            args: Tuple of (page_num, img_array, roi, resize_factor, dirname)

        Returns:
            Tuple (page_num, detected_number); detected_number is None on
            failure or when no digits were found.
        """
        page_num, img_array, roi, resize_factor, dirname = args
        try:
            # Create temporary detector instance for this worker
            temp_detector = Detection(dirname=dirname, roi=roi)
            detected_number = temp_detector.detect_number_on_image(
                img_array, page_num, show_detection=False, resize_factor=resize_factor
            )
            return (page_num, detected_number)
        except Exception:
            return (page_num, None)

    @staticmethod
    def _worker_count(n_tasks):
        """Return the pool size: N_CPU (or all cores), capped by n_tasks, at least 1."""
        limit = cpu_count() if N_CPU is None else N_CPU
        return max(1, min(limit, n_tasks))

    def detect_numbers_in_tiff(self, tiff_path, verbose=True, use_multiprocessing=True, resize_factor=1.0):
        """
        Process a multi-page TIFF file and detect numbers on each page.

        All pages are loaded into memory first and then processed either in a
        multiprocessing pool or sequentially.

        Args:
            tiff_path: Path to the TIFF file
            verbose: If True, print progress messages
            use_multiprocessing: If True, use parallel processing (faster for many pages)
            resize_factor: Factor to resize ROI before OCR (1.0 = no resize, 0.5 = half size for speed)

        Returns:
            List of tuples (page_num, detected_number), sorted by page number
            (1-based). Empty list if tiff_path does not exist.

        Raises:
            RuntimeError: If opening or processing the file fails; the
                original exception is attached as __cause__.
        """
        if not os.path.exists(tiff_path):
            return []

        results = []

        try:
            pages_data = []
            # The context manager closes the file even if loading a page fails.
            with Image.open(tiff_path) as img:
                num_pages = getattr(img, 'n_frames', 1)

                if verbose:
                    print(f"Found {num_pages} page(s) in TIFF file")
                    if use_multiprocessing and num_pages > 1:
                        # Report the pool size that will really be used
                        # (honours N_CPU), not just the machine's core count.
                        print(f"Using multiprocessing with {self._worker_count(num_pages)} cores")
                    if resize_factor < 1.0:
                        print(f"Resizing ROI by factor {resize_factor} for speed")
                    print("Processing pages...")

                # Load all pages into memory
                for page_index in range(num_pages):
                    try:
                        img.seek(page_index)
                        img_array = np.array(img)

                        # Convert RGBA to RGB if necessary
                        if len(img_array.shape) == 3 and img_array.shape[2] == 4:
                            img_array = cv2.cvtColor(img_array, cv2.COLOR_RGBA2RGB)

                        pages_data.append((page_index + 1, img_array))
                    except EOFError:
                        # Fewer frames than advertised: stop at the last good one.
                        break

            if use_multiprocessing and len(pages_data) > 1:
                num_workers = self._worker_count(len(pages_data))

                if verbose:
                    print(f"Using {num_workers} CPU core(s) for parallel processing")

                process_args = [(page_num, img_array, self.roi, resize_factor, self.dirname)
                                for page_num, img_array in pages_data]

                start_time = time.perf_counter()
                with Pool(processes=num_workers) as pool:
                    results = pool.map(Detection._process_single_page_static, process_args)
                processing_time = time.perf_counter() - start_time

                if verbose:
                    per_page = processing_time / len(results) if results else 0.0
                    print(f"Processing time: {processing_time:.2f} seconds ({per_page:.3f} sec/page)")
            else:
                # Sequential processing
                start_time = time.perf_counter()
                for page_num, img_array in pages_data:
                    if verbose and page_num % 10 == 1:
                        print(f" Processing page {page_num}/{num_pages}...", end='\r')

                    detected_number = self.detect_number_on_image(
                        img_array, page_num, show_detection=False, resize_factor=resize_factor
                    )
                    results.append((page_num, detected_number))
                processing_time = time.perf_counter() - start_time

                if verbose:
                    # Guard against an empty page list (no division by zero).
                    per_page = processing_time / len(results) if results else 0.0
                    print(f"\nProcessing time: {processing_time:.2f} seconds ({per_page:.3f} sec/page)")

            # Sort results by page number (important for multiprocessing)
            results.sort(key=lambda item: item[0])

            if verbose:
                print(f"\nCompleted processing {len(results)} page(s)")

        except Exception as e:
            raise RuntimeError(f"Error opening TIFF file: {e}") from e

        return results

    def detect_numbers_in_directory(self, wafer_number=None, skip_if_csv_exists=True):
        """
        Detect numbers in all TIFF files in the directory or specific wafer folder.

        The search is recursive. A failure on one file does not stop the run:
        the error is stored as a string in place of that file's results.

        Args:
            wafer_number (str, optional): Specific wafer number to process
                (sub-folder of dirname). If None, processes all TIFF files in dirname.
            skip_if_csv_exists (bool): If True, return an empty dictionary when
                the search directory already contains detection_results.csv.

        Returns:
            Dictionary mapping file paths to detection results: either the
            list returned by detect_numbers_in_tiff or an "Error: ..." string.
            Empty if the directory does not exist or was skipped.
        """
        results = {}

        if wafer_number:
            search_dir = os.path.join(self.dirname, str(wafer_number))
        else:
            search_dir = self.dirname

        if not os.path.exists(search_dir):
            return results

        # Already processed? Only the top of search_dir is checked.
        if skip_if_csv_exists and os.path.exists(os.path.join(search_dir, "detection_results.csv")):
            return results

        # Find all TIFF files (case-insensitive extension match).
        tiff_files = [
            os.path.join(root, file_name)
            for root, _dirs, files in os.walk(search_dir)
            for file_name in files
            if file_name.lower().endswith(('.tif', '.tiff'))
        ]

        for tiff_path in tiff_files:
            try:
                results[tiff_path] = self.detect_numbers_in_tiff(tiff_path, verbose=False)
            except Exception as e:
                results[tiff_path] = f"Error: {e}"

        return results

    def save_results_to_csv(self, results, output_path=None):
        """
        Save detection results to a CSV file.

        The file always has the columns File, Path, Page, Detected_Number,
        even when there is nothing to report. Pages without a detection are
        written as the string 'None'.

        Args:
            results: Dictionary from detect_numbers_in_directory or list from detect_numbers_in_tiff
            output_path: Path to output CSV file. If None, saves
                detection_results.csv in dirname.

        Returns:
            Path to saved CSV file
        """
        if output_path is None:
            output_path = os.path.join(self.dirname, "detection_results.csv")

        rows = []

        def add_page_rows(file_name, file_path, page_results):
            for page_num, detected_number in page_results:
                rows.append({
                    'File': file_name,
                    'Path': file_path,
                    'Page': page_num,
                    'Detected_Number': detected_number if detected_number else 'None'
                })

        if isinstance(results, dict):
            # Results from detect_numbers_in_directory
            for file_path, file_results in results.items():
                file_name = os.path.basename(file_path)
                if isinstance(file_results, list):
                    add_page_rows(file_name, file_path, file_results)
                else:
                    # Per-file error message instead of page results.
                    rows.append({
                        'File': file_name,
                        'Path': file_path,
                        'Page': 'N/A',
                        'Detected_Number': str(file_results)
                    })
        elif isinstance(results, list):
            # Results from detect_numbers_in_tiff
            add_page_rows('N/A', 'N/A', results)

        # Passing the columns explicitly keeps the header row when rows is empty.
        df = pd.DataFrame(rows, columns=list(self.CSV_COLUMNS))
        df.to_csv(output_path, index=False)

        return output_path
423
+
424
+
425
if __name__ == "__main__":
    # Manual smoke test. Usage:
    #     python <this file> [path/to/file.tif]
    # Without an argument the developer's sample file below is used.
    # The __main__ guard is also what keeps multiprocessing workers from
    # re-running this section when the module is re-imported in a child.
    import sys
    import traceback

    # Example 1: Test on a single TIFF file
    default_tiff_file = r"C:\Users\TM273821\Desktop\SEM\Detection\1\AsGa_FAV_2X_WIW_200_2X_REVIEW_03151011.tif"
    tiff_file = sys.argv[1] if len(sys.argv) > 1 else default_tiff_file

    print("=" * 60)
    print("DETECTION TEST")
    print("=" * 60)
    print(f"File: {tiff_file}")
    print(f"File exists: {os.path.exists(tiff_file)}")

    if not os.path.exists(tiff_file):
        print(f"ERROR: File not found: {tiff_file}")
        # sys.exit rather than the interactive-only exit() helper, which is
        # injected by the site module and may be absent (python -S, frozen apps).
        sys.exit(1)

    try:
        print("\nInitializing detector...")
        detector = Detection(dirname=os.path.dirname(tiff_file))
        print(f"ROI: {detector.roi}")
        print(f"CPU cores available: {cpu_count()}")
        print(f"CPU cores to use: {N_CPU if N_CPU is not None else 'All (' + str(cpu_count()) + ')'}")

        print("\nProcessing TIFF file...")
        start_total = time.time()

        # Use multiprocessing and resize for speed
        results = detector.detect_numbers_in_tiff(
            tiff_file,
            use_multiprocessing=True,
            resize_factor=1.0  # Set to 0.5 for even faster processing (may reduce accuracy)
        )

        total_time = time.time() - start_total

        print(f"\n{'='*60}")
        print(f"TOTAL TIME: {total_time:.2f} seconds")
        if len(results) > 0:
            print(f"Time per page: {total_time/len(results):.3f} seconds")
        print(f"{'='*60}")

        print(f"\nTotal pages processed: {len(results)}")
        print("\nResults:")
        print("-" * 60)
        for page_num, detected_number in results:
            if detected_number:
                print(f"Page {page_num}: {detected_number}")
            else:
                print(f"Page {page_num}: No number detected")

        # Save results next to the input file.
        output_csv = os.path.join(os.path.dirname(tiff_file), "detection_results.csv")
        print(f"\nSaving results to: {output_csv}")
        detector.save_results_to_csv(results, output_csv)
        print("Results saved successfully!")

    except Exception as e:
        print(f"\nERROR: {e}")
        traceback.print_exc()

    # Example 2: Test on a directory
    # Uncomment and modify the path to test
    # dirname = r"C:\Users\TM273821\Desktop\SEM\Detection"
    # detector = Detection(dirname=dirname)
    # results = detector.detect_numbers_in_directory()
    # detector.save_results_to_csv(results)

    # Example 3: Test on a specific wafer folder
    # Uncomment and modify the paths to test
    # dirname = r"C:\Users\TM273821\Desktop\SEM\Detection"
    # wafer_number = "1"
    # detector = Detection(dirname=dirname)
    # results = detector.detect_numbers_in_directory(wafer_number=wafer_number)
    # output_path = os.path.join(dirname, wafer_number, "detection_results.csv")
    # detector.save_results_to_csv(results, output_path)

    # Example 4: Test with custom ROI
    # Uncomment and modify the path to test
    # tiff_file = r"C:\Users\TM273821\Desktop\SEM\Detection\AsGa_FAV_2X_WIW_200_2X_REVIEW_03151011.tif"
    # custom_roi = (1100, 0, 250, 35)  # x, y, width, height
    # detector = Detection(dirname=os.path.dirname(tiff_file), roi=custom_roi)
    # results = detector.detect_numbers_in_tiff(tiff_file)
    # print("\nResults with custom ROI:")
    # for page_num, detected_number in results:
    #     print(f"Page {page_num}: {detected_number}")

    # print("Detection module loaded. Uncomment examples in __main__ to test.")