visual_rag_toolkit-0.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. benchmarks/README.md +101 -0
  2. benchmarks/__init__.py +11 -0
  3. benchmarks/analyze_results.py +187 -0
  4. benchmarks/benchmark_datasets.txt +105 -0
  5. benchmarks/prepare_submission.py +205 -0
  6. benchmarks/quick_test.py +566 -0
  7. benchmarks/run_vidore.py +513 -0
  8. benchmarks/vidore_beir_qdrant/run_qdrant_beir.py +1365 -0
  9. benchmarks/vidore_tatdqa_test/COMMANDS.md +83 -0
  10. benchmarks/vidore_tatdqa_test/__init__.py +6 -0
  11. benchmarks/vidore_tatdqa_test/dataset_loader.py +363 -0
  12. benchmarks/vidore_tatdqa_test/metrics.py +44 -0
  13. benchmarks/vidore_tatdqa_test/run_qdrant.py +799 -0
  14. benchmarks/vidore_tatdqa_test/sweep_eval.py +372 -0
  15. demo/__init__.py +10 -0
  16. demo/app.py +45 -0
  17. demo/commands.py +334 -0
  18. demo/config.py +34 -0
  19. demo/download_models.py +75 -0
  20. demo/evaluation.py +602 -0
  21. demo/example_metadata_mapping_sigir.json +37 -0
  22. demo/indexing.py +286 -0
  23. demo/qdrant_utils.py +211 -0
  24. demo/results.py +35 -0
  25. demo/test_qdrant_connection.py +119 -0
  26. demo/ui/__init__.py +15 -0
  27. demo/ui/benchmark.py +355 -0
  28. demo/ui/header.py +30 -0
  29. demo/ui/playground.py +339 -0
  30. demo/ui/sidebar.py +162 -0
  31. demo/ui/upload.py +487 -0
  32. visual_rag/__init__.py +98 -0
  33. visual_rag/cli/__init__.py +1 -0
  34. visual_rag/cli/main.py +629 -0
  35. visual_rag/config.py +230 -0
  36. visual_rag/demo_runner.py +90 -0
  37. visual_rag/embedding/__init__.py +26 -0
  38. visual_rag/embedding/pooling.py +343 -0
  39. visual_rag/embedding/visual_embedder.py +622 -0
  40. visual_rag/indexing/__init__.py +21 -0
  41. visual_rag/indexing/cloudinary_uploader.py +274 -0
  42. visual_rag/indexing/pdf_processor.py +324 -0
  43. visual_rag/indexing/pipeline.py +628 -0
  44. visual_rag/indexing/qdrant_indexer.py +478 -0
  45. visual_rag/preprocessing/__init__.py +3 -0
  46. visual_rag/preprocessing/crop_empty.py +120 -0
  47. visual_rag/qdrant_admin.py +222 -0
  48. visual_rag/retrieval/__init__.py +19 -0
  49. visual_rag/retrieval/multi_vector.py +222 -0
  50. visual_rag/retrieval/single_stage.py +126 -0
  51. visual_rag/retrieval/three_stage.py +173 -0
  52. visual_rag/retrieval/two_stage.py +471 -0
  53. visual_rag/visualization/__init__.py +19 -0
  54. visual_rag/visualization/saliency.py +335 -0
  55. visual_rag_toolkit-0.1.1.dist-info/METADATA +305 -0
  56. visual_rag_toolkit-0.1.1.dist-info/RECORD +59 -0
  57. visual_rag_toolkit-0.1.1.dist-info/WHEEL +4 -0
  58. visual_rag_toolkit-0.1.1.dist-info/entry_points.txt +3 -0
  59. visual_rag_toolkit-0.1.1.dist-info/licenses/LICENSE +22 -0
visual_rag/indexing/cloudinary_uploader.py
@@ -0,0 +1,274 @@
+ """
+ Cloudinary Uploader - Upload images to Cloudinary CDN.
+
+ Works INDEPENDENTLY of PDF processing and embedding.
+ Use it if you just need to upload images to a CDN.
+
+ Features:
+ - Retry logic with timeouts
+ - Batch uploading
+ - Automatic JPEG optimization
+
+ Environment Variables:
+ - VISUAL_RAG_THREAD_SAFE: Set to "1" to use thread-safe timeouts
+   (required for Streamlit, Flask, or other threaded contexts)
+ """
+
+ import io
+ import logging
+ import os
+ import platform
+ import signal
+ import threading
+ import time
+ from concurrent.futures import ThreadPoolExecutor
+ from concurrent.futures import TimeoutError as FuturesTimeoutError
+ from typing import Optional
+
+ from PIL import Image
+
+ logger = logging.getLogger(__name__)
+
+ THREAD_SAFE_MODE = os.getenv("VISUAL_RAG_THREAD_SAFE", "").lower() in ("1", "true", "yes")
+
+
+ class CloudinaryUploader:
+     """
+     Upload images to Cloudinary CDN.
+
+     Works independently - just needs PIL images.
+
+     Args:
+         cloud_name: Cloudinary cloud name
+         api_key: Cloudinary API key
+         api_secret: Cloudinary API secret
+         folder: Base folder for uploads
+         max_retries: Number of retry attempts
+         timeout_seconds: Timeout per upload
+         jpeg_quality: JPEG quality used when encoding images for upload
+
+     Example:
+         >>> uploader = CloudinaryUploader(
+         ...     cloud_name="my-cloud",
+         ...     api_key="xxx",
+         ...     api_secret="yyy",
+         ...     folder="my-project",
+         ... )
+         >>>
+         >>> url = uploader.upload(image, "doc_page_1")
+         >>> print(url)  # https://res.cloudinary.com/.../doc_page_1.jpg
+     """
+
+     def __init__(
+         self,
+         cloud_name: Optional[str] = None,
+         api_key: Optional[str] = None,
+         api_secret: Optional[str] = None,
+         folder: str = "visual-rag",
+         max_retries: int = 3,
+         timeout_seconds: int = 30,
+         jpeg_quality: int = 95,
+     ):
+         # Load from environment if not provided
+         self.cloud_name = cloud_name or os.getenv("CLOUDINARY_CLOUD_NAME")
+         self.api_key = api_key or os.getenv("CLOUDINARY_API_KEY")
+         self.api_secret = api_secret or os.getenv("CLOUDINARY_API_SECRET")
+
+         if not all([self.cloud_name, self.api_key, self.api_secret]):
+             raise ValueError(
+                 "Cloudinary credentials required. Set CLOUDINARY_CLOUD_NAME, "
+                 "CLOUDINARY_API_KEY, CLOUDINARY_API_SECRET environment variables "
+                 "or pass them as arguments."
+             )
+
+         self.folder = folder
+         self.max_retries = max_retries
+         self.timeout_seconds = timeout_seconds
+         self.jpeg_quality = jpeg_quality
+
+         # Check dependency
+         try:
+             import cloudinary  # noqa
+         except ImportError:
+             raise ImportError(
+                 "Cloudinary not installed. "
+                 "Install with: pip install visual-rag-toolkit[cloudinary]"
+             )
+
+         logger.info("☁️ Cloudinary uploader initialized")
+         logger.info(f"   Folder: {folder}")
+
+     def upload(
+         self,
+         image: Image.Image,
+         public_id: str,
+         subfolder: Optional[str] = None,
+     ) -> Optional[str]:
+         """
+         Upload a single image to Cloudinary.
+
+         Args:
+             image: PIL Image to upload
+             public_id: Public ID (filename without extension)
+             subfolder: Optional subfolder within base folder
+
+         Returns:
+             Secure URL of uploaded image, or None if failed
+         """
+         import cloudinary
+         import cloudinary.uploader
+
+         # Prepare buffer
+         buffer = io.BytesIO()
+         image.save(buffer, format="JPEG", quality=self.jpeg_quality, optimize=True)
+
+         # Configure Cloudinary
+         cloudinary.config(
+             cloud_name=self.cloud_name,
+             api_key=self.api_key,
+             api_secret=self.api_secret,
+         )
+
+         # Build folder path
+         folder_path = self.folder
+         if subfolder:
+             folder_path = f"{self.folder}/{subfolder}"
+
+         def do_upload():
+             buffer.seek(0)
+             result = cloudinary.uploader.upload(
+                 buffer,
+                 folder=folder_path,
+                 overwrite=True,
+                 public_id=public_id,
+                 resource_type="image",
+                 timeout=self.timeout_seconds,
+             )
+             return result["secure_url"]
+
+         # Use thread-safe mode for Streamlit/Flask/threaded contexts
+         # Set VISUAL_RAG_THREAD_SAFE=1 to enable
+         if THREAD_SAFE_MODE or threading.current_thread() is not threading.main_thread():
+             return self._upload_with_thread_timeout(do_upload, public_id)
+         else:
+             return self._upload_with_signal_timeout(do_upload, public_id)
+
+     def _upload_with_thread_timeout(self, do_upload, public_id: str) -> Optional[str]:
+         """Thread-safe upload with ThreadPoolExecutor timeout."""
+         for attempt in range(self.max_retries):
+             try:
+                 with ThreadPoolExecutor(max_workers=1) as executor:
+                     future = executor.submit(do_upload)
+                     return future.result(timeout=self.timeout_seconds)
+
+             except FuturesTimeoutError:
+                 logger.warning(
+                     f"Upload timeout (attempt {attempt + 1}/{self.max_retries}): {public_id}"
+                 )
+                 if attempt < self.max_retries - 1:
+                     time.sleep(2**attempt)
+
+             except Exception as e:
+                 logger.warning(f"Upload failed (attempt {attempt + 1}/{self.max_retries}): {e}")
+                 if attempt < self.max_retries - 1:
+                     time.sleep(2**attempt)
+
+         logger.error(f"❌ Upload failed after {self.max_retries} attempts: {public_id}")
+         return None
+
+     def _upload_with_signal_timeout(self, do_upload, public_id: str) -> Optional[str]:
+         """Signal-based upload timeout (main thread only, Unix/macOS)."""
+         use_timeout = platform.system() != "Windows"
+
+         class SignalTimeoutError(Exception):
+             pass
+
+         def timeout_handler(signum, frame):
+             raise SignalTimeoutError(f"Upload timed out after {self.timeout_seconds}s")
+
+         for attempt in range(self.max_retries):
+             try:
+                 if use_timeout:
+                     old_handler = signal.signal(signal.SIGALRM, timeout_handler)
+                     signal.alarm(self.timeout_seconds)
+
+                 try:
+                     return do_upload()
+                 finally:
+                     if use_timeout:
+                         signal.alarm(0)
+                         signal.signal(signal.SIGALRM, old_handler)
+
+             except SignalTimeoutError:
+                 logger.warning(
+                     f"Upload timeout (attempt {attempt + 1}/{self.max_retries}): {public_id}"
+                 )
+                 if attempt < self.max_retries - 1:
+                     time.sleep(2**attempt)
+
+             except Exception as e:
+                 logger.warning(f"Upload failed (attempt {attempt + 1}/{self.max_retries}): {e}")
+                 if attempt < self.max_retries - 1:
+                     time.sleep(2**attempt)
+
+         logger.error(f"❌ Upload failed after {self.max_retries} attempts: {public_id}")
+         return None
+
+     def upload_original_and_resized(
+         self,
+         original_image: Image.Image,
+         resized_image: Image.Image,
+         base_public_id: str,
+     ) -> tuple:
+         """
+         Upload both original and resized versions.
+
+         Args:
+             original_image: Original PDF page image
+             resized_image: Resized image for ColPali
+             base_public_id: Base public ID (e.g., "doc_page_1")
+
+         Returns:
+             Tuple of (original_url, resized_url) - either can be None on failure
+         """
+         original_url = self.upload(
+             original_image,
+             base_public_id,
+             subfolder="original",
+         )
+
+         resized_url = self.upload(
+             resized_image,
+             base_public_id,
+             subfolder="resized",
+         )
+
+         return original_url, resized_url
+
+     def upload_original_cropped_and_resized(
+         self,
+         original_image: Image.Image,
+         cropped_image: Optional[Image.Image],
+         resized_image: Image.Image,
+         base_public_id: str,
+     ) -> tuple:
+         """Upload original, optional cropped, and resized versions.
+
+         Returns:
+             Tuple of (original_url, cropped_url, resized_url); any entry can be
+             None on failure, and cropped_url is None if no cropped image is given.
+         """
+         original_url = self.upload(
+             original_image,
+             base_public_id,
+             subfolder="original",
+         )
+
+         cropped_url = None
+         if cropped_image is not None:
+             cropped_url = self.upload(
+                 cropped_image,
+                 base_public_id,
+                 subfolder="cropped",
+             )
+
+         resized_url = self.upload(
+             resized_image,
+             base_public_id,
+             subfolder="resized",
+         )
+
+         return original_url, cropped_url, resized_url
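A minimal driver sketch for this uploader (not part of the wheel; it assumes the three CLOUDINARY_* environment variables are set and that "page.jpg" is a hypothetical local file). Note that THREAD_SAFE_MODE is evaluated once at import time, so the flag must be set before the module is imported:

    import os

    # THREAD_SAFE_MODE is read at import time, so set the flag before importing
    # the module (needed when uploading from Streamlit/Flask worker threads).
    os.environ["VISUAL_RAG_THREAD_SAFE"] = "1"

    from PIL import Image

    from visual_rag.indexing.cloudinary_uploader import CloudinaryUploader

    # Credentials fall back to CLOUDINARY_CLOUD_NAME / CLOUDINARY_API_KEY /
    # CLOUDINARY_API_SECRET because none are passed explicitly here.
    uploader = CloudinaryUploader(folder="my-project", timeout_seconds=30)

    image = Image.open("page.jpg")  # hypothetical input image
    url = uploader.upload(image, "doc_page_1", subfolder="original")
    if url is None:
        # upload() logs warnings and returns None after max_retries attempts.
        print("Upload failed; check the log output")
    else:
        print(url)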
visual_rag/indexing/pdf_processor.py
@@ -0,0 +1,324 @@
+ """
+ PDF Processor - Convert PDFs to images and extract text.
+
+ This module works INDEPENDENTLY of embedding and vector storage.
+ Use it if you just need PDF → images conversion.
+
+ Features:
+ - Batch processing to save memory
+ - Text extraction with surrogate character handling
+ - Configurable DPI and quality settings
+ """
+
+ import gc
+ import logging
+ import re
+ from pathlib import Path
+ from typing import Any, Dict, Generator, List, Optional, Tuple
+
+ from PIL import Image
+
+ logger = logging.getLogger(__name__)
+
+
+ class PDFProcessor:
+     """
+     Process PDFs into images and text for visual retrieval.
+
+     Works independently - no embedding or storage dependencies.
+
+     Args:
+         dpi: DPI for image conversion (higher = better quality)
+         output_format: Image format (RGB, L, etc.)
+         page_batch_size: Pages per batch for memory efficiency
+
+     Example:
+         >>> processor = PDFProcessor(dpi=140)
+         >>>
+         >>> # Convert single PDF
+         >>> images, texts = processor.process_pdf(Path("report.pdf"))
+         >>>
+         >>> # Stream large PDFs (yields images, texts, and the batch's start page)
+         >>> for images, texts, start_page in processor.stream_pdf(Path("large.pdf"), batch_size=10):
+         ...     # Process each batch
+         ...     pass
+     """
+
+     def __init__(
+         self,
+         dpi: int = 140,
+         output_format: str = "RGB",
+         page_batch_size: int = 50,
+     ):
+         self.dpi = dpi
+         self.output_format = output_format
+         self.page_batch_size = page_batch_size
+
+         # PDF deps are optional: we only require them when calling PDF-specific methods.
+         # This keeps the class usable for helper utilities like `resize_for_colpali()`
+         # even in minimal installs.
+         self._pdf_deps_available = True
+         try:
+             import pdf2image  # noqa: F401
+             import pypdf  # noqa: F401
+         except Exception:
+             self._pdf_deps_available = False
+
+     def _require_pdf_deps(self) -> None:
+         if not self._pdf_deps_available:
+             raise ImportError(
+                 "PDF processing requires `pdf2image` and `pypdf`.\n"
+                 'Install with: pip install "visual-rag-toolkit[pdf]"'
+             )
+
+     def process_pdf(
+         self,
+         pdf_path: Path,
+         dpi: Optional[int] = None,
+     ) -> Tuple[List[Image.Image], List[str]]:
+         """
+         Convert PDF to images and extract text.
+
+         Args:
+             pdf_path: Path to PDF file
+             dpi: Override default DPI
+
+         Returns:
+             Tuple of (list of images, list of page texts)
+         """
+         self._require_pdf_deps()
+         from pdf2image import convert_from_path
+         from pypdf import PdfReader
+
+         dpi = dpi or self.dpi
+         pdf_path = Path(pdf_path)
+
+         logger.info(f"📄 Processing PDF: {pdf_path.name}")
+
+         # Extract text
+         reader = PdfReader(str(pdf_path))
+         total_pages = len(reader.pages)
+
+         page_texts = []
+         for page in reader.pages:
+             text = page.extract_text() or ""
+             # Handle surrogate characters
+             text = self._sanitize_text(text)
+             page_texts.append(text)
+
+         # Convert to images in batches
+         all_images = []
+         for start_page in range(1, total_pages + 1, self.page_batch_size):
+             end_page = min(start_page + self.page_batch_size - 1, total_pages)
+
+             batch_images = convert_from_path(
+                 str(pdf_path),
+                 dpi=dpi,
+                 fmt=self.output_format.lower(),
+                 first_page=start_page,
+                 last_page=end_page,
+             )
+
+             all_images.extend(batch_images)
+
+             del batch_images
+             gc.collect()
+
+         assert len(all_images) == len(
+             page_texts
+         ), f"Mismatch: {len(all_images)} images vs {len(page_texts)} texts"
+
+         logger.info(f"✅ Processed {len(all_images)} pages")
+         return all_images, page_texts
+
+     def stream_pdf(
+         self,
+         pdf_path: Path,
+         batch_size: int = 10,
+         dpi: Optional[int] = None,
+     ) -> Generator[Tuple[List[Image.Image], List[str], int], None, None]:
+         """
+         Stream PDF processing for large files.
+
+         Yields batches of (images, texts, start_page) without loading
+         the entire PDF into memory.
+
+         Args:
+             pdf_path: Path to PDF file
+             batch_size: Pages per batch
+             dpi: Override default DPI
+
+         Yields:
+             Tuple of (batch_images, batch_texts, start_page_number)
+         """
+         self._require_pdf_deps()
+         from pdf2image import convert_from_path
+         from pypdf import PdfReader
+
+         dpi = dpi or self.dpi
+         pdf_path = Path(pdf_path)
+
+         reader = PdfReader(str(pdf_path))
+         total_pages = len(reader.pages)
+
+         logger.info(f"📄 Streaming PDF: {pdf_path.name} ({total_pages} pages)")
+
+         for start_idx in range(0, total_pages, batch_size):
+             end_idx = min(start_idx + batch_size, total_pages)
+
+             # Extract text for batch
+             batch_texts = []
+             for page_idx in range(start_idx, end_idx):
+                 text = reader.pages[page_idx].extract_text() or ""
+                 text = self._sanitize_text(text)
+                 batch_texts.append(text)
+
+             # Convert images for batch
+             batch_images = convert_from_path(
+                 str(pdf_path),
+                 dpi=dpi,
+                 fmt=self.output_format.lower(),
+                 first_page=start_idx + 1,  # 1-indexed
+                 last_page=end_idx,
+             )
+
+             yield batch_images, batch_texts, start_idx + 1
+
+             del batch_images
+             gc.collect()
+
+     def get_page_count(self, pdf_path: Path) -> int:
+         """Get number of pages in PDF without loading images."""
+         self._require_pdf_deps()
+         from pypdf import PdfReader
+
+         reader = PdfReader(str(pdf_path))
+         return len(reader.pages)
+
+     def resize_for_colpali(
+         self,
+         image: Image.Image,
+         max_edge: int = 2048,
+         tile_size: int = 512,
+     ) -> Tuple[Image.Image, int, int]:
+         """
+         Resize image following ColPali/Idefics3 processor logic.
+
+         Resizes to fit within tile grid without black padding.
+
+         Args:
+             image: PIL Image
+             max_edge: Maximum edge length
+             tile_size: Size of each tile
+
+         Returns:
+             Tuple of (resized_image, tile_rows, tile_cols)
+         """
+         # Ensure consistent mode for downstream processors (and predictable tests)
+         if image.mode != "RGB":
+             image = image.convert("RGB")
+
+         w, h = image.size
+
+         # Step 1: Resize so longest edge = max_edge
+         if w > h:
+             new_w = max_edge
+             new_h = int(h * (max_edge / w))
+         else:
+             new_h = max_edge
+             new_w = int(w * (max_edge / h))
+
+         # Step 2: Calculate tile grid
+         tile_cols = (new_w + tile_size - 1) // tile_size
+         tile_rows = (new_h + tile_size - 1) // tile_size
+
+         # Step 3: Calculate exact dimensions for tiles
+         final_w = tile_cols * tile_size
+         final_h = tile_rows * tile_size
+
+         # Step 4: Scale to fit within tile grid
+         scale_w = final_w / w
+         scale_h = final_h / h
+         scale = min(scale_w, scale_h)
+
+         scaled_w = int(w * scale)
+         scaled_h = int(h * scale)
+
+         resized = image.resize((scaled_w, scaled_h), Image.LANCZOS)
+
+         # Center on white canvas if needed
+         if scaled_w != final_w or scaled_h != final_h:
+             canvas = Image.new("RGB", (final_w, final_h), (255, 255, 255))
+             offset_x = (final_w - scaled_w) // 2
+             offset_y = (final_h - scaled_h) // 2
+             canvas.paste(resized, (offset_x, offset_y))
+             resized = canvas
+
+         return resized, tile_rows, tile_cols
+
+     def _sanitize_text(self, text: str) -> str:
+         """Remove invalid Unicode characters (surrogates) from text."""
+         if not text:
+             return ""
+
+         # Remove surrogate characters (U+D800-U+DFFF)
+         return text.encode("utf-8", errors="surrogatepass").decode("utf-8", errors="ignore")
+
+     def extract_metadata_from_filename(
+         self,
+         filename: str,
+         mapping: Optional[Dict[str, Dict[str, Any]]] = None,
+     ) -> Dict[str, Any]:
+         """
+         Extract metadata from PDF filename.
+
+         Uses mapping if provided, otherwise falls back to pattern matching.
+
+         Args:
+             filename: PDF filename (with or without .pdf extension)
+             mapping: Optional mapping dict {filename: metadata}
+
+         Returns:
+             Metadata dict with year, source, district, etc.
+         """
+         # Remove extension
+         stem = Path(filename).stem
+         stem_lower = stem.lower().strip()
+
+         # Try mapping first
+         if mapping:
+             if stem_lower in mapping:
+                 return mapping[stem_lower].copy()
+
+             # Try without .pdf
+             stem_no_ext = stem_lower.replace(".pdf", "")
+             if stem_no_ext in mapping:
+                 return mapping[stem_no_ext].copy()
+
+         # Fallback: pattern matching
+         metadata = {"filename": filename}
+
+         # Extract year
+         year_match = re.search(r"(20\d{2})", stem)
+         if year_match:
+             metadata["year"] = int(year_match.group(1))
+
+         # Detect source type
+         if "consolidated" in stem_lower or ("annual" in stem_lower and "oag" in stem_lower):
+             metadata["source"] = "Consolidated"
+         elif "dlg" in stem_lower or "district local government" in stem_lower:
+             metadata["source"] = "Local Government"
+             # Try to extract district name
+             district_match = re.search(r"([a-z]+)\s+(?:dlg|district local government)", stem_lower)
+             if district_match:
+                 metadata["district"] = district_match.group(1).title()
+         elif "hospital" in stem_lower or "referral" in stem_lower:
+             metadata["source"] = "Hospital"
+         elif "ministry" in stem_lower:
+             metadata["source"] = "Ministry"
+         elif "project" in stem_lower:
+             metadata["source"] = "Project"
+         else:
+             metadata["source"] = "Unknown"
+
+         return metadata
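A similar sketch for PDFProcessor (not part of the wheel; "report.pdf" and "gulu dlg 2022.pdf" are hypothetical inputs, and the [pdf] extra plus a poppler install for pdf2image are assumed). It also works through the tile math in resize_for_colpali: a 1000x1400 px page is scaled to 1462x2048 px, snapped to a 3x4 grid of 512 px tiles, and centered on a 1536x2048 white canvas:

    from pathlib import Path

    from visual_rag.indexing.pdf_processor import PDFProcessor

    processor = PDFProcessor(dpi=140)

    # Stream in 10-page batches; each yield is (images, texts, 1-indexed start page).
    for images, texts, start_page in processor.stream_pdf(Path("report.pdf"), batch_size=10):
        for offset, (image, text) in enumerate(zip(images, texts)):
            resized, rows, cols = processor.resize_for_colpali(image, max_edge=2048, tile_size=512)
            # e.g. a 1000x1400 page -> resized.size == (1536, 2048), rows=4, cols=3
            print(f"page {start_page + offset}: {resized.size}, grid {rows}x{cols}, {len(text)} chars")

    # Filename-based metadata fallback (no mapping supplied):
    meta = processor.extract_metadata_from_filename("gulu dlg 2022.pdf")
    # -> {"filename": "gulu dlg 2022.pdf", "year": 2022,
    #     "source": "Local Government", "district": "Gulu"}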