memra 0.2.12__py3-none-any.whl → 0.2.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. memra/cli.py +114 -8
  2. memra/demos/etl_invoice_processing/check_after_workflow.py +50 -0
  3. memra/demos/etl_invoice_processing/check_database.py +44 -0
  4. memra/demos/etl_invoice_processing/check_recent_db.py +42 -0
  5. memra/demos/etl_invoice_processing/data/README.md +112 -0
  6. memra/demos/etl_invoice_processing/data/invoices/10352259401.PDF +0 -0
  7. memra/demos/etl_invoice_processing/data/invoices/10352259823.PDF +0 -0
  8. memra/demos/etl_invoice_processing/data/invoices/10352260169.PDF +0 -0
  9. memra/demos/etl_invoice_processing/data/invoices/10352260417.PDF +0 -0
  10. memra/demos/etl_invoice_processing/data/invoices/10352260599.PDF +0 -0
  11. memra/demos/etl_invoice_processing/data/invoices/10352260912.PDF +0 -0
  12. memra/demos/etl_invoice_processing/data/invoices/10352261134.PDF +0 -0
  13. memra/demos/etl_invoice_processing/data/invoices/10352261563.PDF +0 -0
  14. memra/demos/etl_invoice_processing/data/invoices/10352261647.PDF +0 -0
  15. memra/demos/etl_invoice_processing/data/invoices/10352261720.PDF +0 -0
  16. memra/demos/etl_invoice_processing/data/invoices/10352261811.PDF +0 -0
  17. memra/demos/etl_invoice_processing/data/invoices/10352262025.PDF +0 -0
  18. memra/demos/etl_invoice_processing/data/invoices/10352262454.PDF +0 -0
  19. memra/demos/etl_invoice_processing/data/invoices/10352262702.PDF +0 -0
  20. memra/demos/etl_invoice_processing/data/invoices/10352262884.PDF +0 -0
  21. memra/demos/etl_invoice_processing/data/invoices/10352263346.PDF +0 -0
  22. memra/demos/etl_invoice_processing/data/invoices/10352263429.PDF +0 -0
  23. memra/demos/etl_invoice_processing/database_monitor_agent.py +89 -0
  24. memra/demos/etl_invoice_processing/debug_mcp.py +66 -0
  25. memra/demos/etl_invoice_processing/debug_schema.py +45 -0
  26. memra/demos/etl_invoice_processing/etl_invoice_demo.py +1233 -0
  27. memra/demos/etl_invoice_processing/modify_database.py +65 -0
  28. memra/demos/etl_invoice_processing/run_etl_batch.py +60 -0
  29. memra/demos/etl_invoice_processing/setup_demo_data.py +154 -0
  30. memra/demos/etl_invoice_processing/simple_pdf_processor.py +181 -0
  31. memra/demos/etl_invoice_processing/test_agent3.py +56 -0
  32. memra/demos/etl_invoice_processing/test_agent3_v2.py +32 -0
  33. memra/demos/etl_invoice_processing/test_api.py +28 -0
  34. memra/demos/etl_invoice_processing/test_api_client_direct.py +89 -0
  35. memra/demos/etl_invoice_processing/test_conversion.py +172 -0
  36. memra/demos/etl_invoice_processing/test_debug.py +41 -0
  37. memra/demos/etl_invoice_processing/test_direct_vision.py +114 -0
  38. memra/demos/etl_invoice_processing/test_full_response.py +22 -0
  39. memra/demos/etl_invoice_processing/test_memra_response.py +124 -0
  40. memra/demos/etl_invoice_processing/test_pdf_processor_response.py +118 -0
  41. memra/demos/etl_invoice_processing/test_pdfprocessor_direct.py +96 -0
  42. memra/demos/etl_invoice_processing/test_postgres_insert.py +120 -0
  43. memra/demos/etl_invoice_processing/test_remote_upload.py +143 -0
  44. memra/demos/etl_invoice_processing/test_schema_format.py +39 -0
  45. memra/demos/etl_invoice_processing/test_sql_executor.py +58 -0
  46. memra/demos/etl_invoice_processing/test_sql_executor_extra_fields.py +61 -0
  47. memra/demos/etl_invoice_processing/test_sql_executor_fix.py +40 -0
  48. memra/demos/etl_invoice_processing/test_updated_server.py +50 -0
  49. memra/demos/etl_invoice_processing/test_upload_functionality.py +156 -0
  50. memra/demos/etl_invoice_processing/test_upload_server.py +232 -0
  51. memra/demos/etl_invoice_processing/test_vision_output.py +75 -0
  52. memra/demos/etl_invoice_processing/test_vision_prompt.py +43 -0
  53. memra/demos/etl_invoice_processing/test_vision_simple.py +60 -0
  54. {memra-0.2.12.dist-info → memra-0.2.13.dist-info}/METADATA +53 -78
  55. memra-0.2.13.dist-info/RECORD +120 -0
  56. {memra-0.2.12.dist-info → memra-0.2.13.dist-info}/WHEEL +1 -1
  57. memra-0.2.12.dist-info/RECORD +0 -68
  58. {memra-0.2.12.dist-info/licenses → memra-0.2.13.dist-info}/LICENSE +0 -0
  59. {memra-0.2.12.dist-info → memra-0.2.13.dist-info}/entry_points.txt +0 -0
  60. {memra-0.2.12.dist-info → memra-0.2.13.dist-info}/top_level.txt +0 -0
memra/demos/etl_invoice_processing/etl_invoice_demo.py
@@ -0,0 +1,1233 @@
+#!/usr/bin/env python3
+"""
+ETL Invoice Processing Demo
+Complete ETL workflow with database monitoring before and after
+"""
+
+import os
+import sys
+import time
+import random
+from pathlib import Path
+from memra import Agent, Department, LLM, check_api_health, get_api_status
+from memra.execution import ExecutionEngine, ExecutionTrace
+from database_monitor_agent import create_simple_monitor_agent, get_monitoring_queries
+import glob
+import requests
+import base64
+import json
+
+# Set API key for authentication
+os.environ['MEMRA_API_KEY'] = 'test-secret-for-development'
+os.environ['MEMRA_API_URL'] = 'https://api.memra.co'
+
+# Add the parent directory to the path so we can import memra
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+# Check for required API key
+if not os.getenv("MEMRA_API_KEY"):
+    print("❌ Error: MEMRA_API_KEY environment variable is required")
+    print("Please set your API key: export MEMRA_API_KEY='test-secret-for-development'")
+    print("Using local MCP bridge server")
+    sys.exit(1)
+
+# Set API configuration - using remote API for all operations including PDF processing
+os.environ["MEMRA_API_URL"] = "https://api.memra.co"
+
+# Store the remote API URL for PDF processing
+REMOTE_API_URL = "https://api.memra.co"
+
+# Define the specific 15 files to process
+TARGET_FILES = [
+    "10352259401.PDF",
+    "10352259823.PDF",
+    "10352260169.PDF",
+    "10352260417.PDF",
+    "10352260599.PDF",
+    "10352260912.PDF",
+    "10352261134.PDF",
+    "10352261563.PDF",
+    "10352261647.PDF",
+    "10352261720.PDF",
+    "10352261811.PDF",
+    "10352262025.PDF",
+    "10352262454.PDF",
+    "10352262702.PDF",
+    "10352262884.PDF"
+]
+
+# Configuration for robust processing
+PROCESSING_CONFIG = {
+    "delay_between_files": 2.5,  # seconds
+    "max_retries": 3,
+    "retry_delay_base": 2,  # seconds
+    "retry_delay_max": 30,  # seconds
+    "timeout_seconds": 120,
+    "rate_limit_delay": 5  # additional delay if rate limited
+}
+
+# Check API health before starting
+print("🔍 Checking Memra API status...")
+api_status = get_api_status()
+print(f"API Health: {'✅ Healthy' if api_status['api_healthy'] else '❌ Unavailable'}")
+print(f"API URL: {api_status['api_url']}")
+print(f"Tools Available: {api_status['tools_available']}")
+
+if not api_status['api_healthy']:
+    print("❌ Cannot proceed - Memra API is not available")
+    sys.exit(1)
+
+# Define LLMs
+default_llm = LLM(
+    model="llama-3.2-11b-vision-preview",
+    temperature=0.1,
+    max_tokens=2000
+)
+
+parsing_llm = LLM(
+    model="llama-3.2-11b-vision-preview",
+    temperature=0.0,
+    max_tokens=4000
+)
+
+manager_llm = LLM(
+    model="llama-3.2-11b-vision-preview",
+    temperature=0.2,
+    max_tokens=1500
+)
+
+# Define agents
+pre_monitor_agent = create_simple_monitor_agent()
+pre_monitor_agent.role = "Pre-ETL Database Monitor"
+
+etl_agent = Agent(
+    role="Data Engineer",
+    job="Extract invoice schema from database",
+    llm=default_llm,
+    sops=[
+        "Connect to database using provided connection string",
+        "Generate SQL query: SELECT column_name, data_type, is_nullable, column_default FROM information_schema.columns WHERE table_name = 'invoices' ORDER BY ordinal_position",
+        "Execute the generated SQL query using SQLExecutor tool",
+        "Extract column names, types, and constraints from results",
+        "Return schema as structured JSON with column information"
+    ],
+    systems=["Database"],
+    tools=[
+        {"name": "SQLExecutor", "hosted_by": "mcp", "input_keys": ["sql_query"]}
+    ],
+    input_keys=["connection", "table_name", "sql_query"],
+    output_key="invoice_schema"
+)
+
+def convert_vision_response_to_extracted_data(vision_response: str) -> dict:
+    """Convert vision model response to extracted_data format"""
+    try:
+        # Clean up the response - remove markdown code blocks if present
+        if vision_response.startswith("```json"):
+            vision_response = vision_response.replace("```json", "").replace("```", "").strip()
+
+        # Parse the JSON response
+        data = json.loads(vision_response)
+
+        # Extract fields with fallback to different naming conventions
+        invoice_number = (
+            data.get("invoice_number") or
+            data.get("InvoiceNumber") or
+            data.get("invoiceNumber") or
+            ""
+        )
+
+        invoice_date = (
+            data.get("invoice_date") or
+            data.get("InvoiceDate") or
+            data.get("invoiceDate") or
+            ""
+        )
+
+        # Convert date format if needed
+        if invoice_date:
+            if "/" in invoice_date and len(invoice_date.split("/")) == 3:
+                parts = invoice_date.split("/")
+                month, day, year = parts[0], parts[1], parts[2]
+                if len(year) == 2:
+                    year = "20" + year
+                invoice_date = f"{year}-{month.zfill(2)}-{day.zfill(2)}"
+
+        amount = (
+            data.get("total_amount") or  # Add this - matches vision model output
+            data.get("amount") or
+            data.get("InvoiceTotal") or
+            data.get("invoiceTotal") or
+            data.get("total") or
+            0
+        )
+
+        vendor_name = (
+            data.get("vendor_name") or
+            data.get("VendorName") or
+            data.get("vendorName") or
+            data.get("Company") or
+            data.get("company") or
+            data.get("Vendor") or
+            data.get("vendor") or
+            ""
+        )
+
+        # If vendor not found, try to infer from the data
+        if not vendor_name:
+            # Check if items mention specific vendors
+            items = data.get("Items") or data.get("Order") or data.get("items") or []
+            for item in items:
+                desc = item.get("Description", "").upper()
+                if "PROPANE" in desc:
+                    vendor_name = "Superior Propane"
+                    break
+
+        tax_amount = (
+            data.get("tax_amount") or
+            data.get("TaxAmount") or
+            data.get("taxAmount") or
+            0
+        )
+
+        due_date = (
+            data.get("due_date") or
+            data.get("DueDate") or
+            data.get("dueDate") or
+            ""
+        )
+
+        line_items = (
+            data.get("line_items") or
+            data.get("Order") or
+            data.get("order") or
+            data.get("LineItems") or
+            data.get("lineItems") or
+            []
+        )
+
+        # Convert to expected format
+        extracted_data = {
+            "headerSection": {
+                "vendorName": vendor_name,
+                "subtotal": float(amount)
+            },
+            "billingDetails": {
+                "invoiceNumber": invoice_number,
+                "invoiceDate": invoice_date,
+                "dueDate": due_date
+            },
+            "chargesSummary": {
+                "document_total": float(amount),
+                "secondary_tax": float(tax_amount),
+                "lineItemsBreakdown": line_items
+            },
+            "status": "processed"
+        }
+
+        return extracted_data
+
+    except Exception as e:
+        print(f"⚠️ Error converting vision response: {e}")
+        return {
+            "headerSection": {"vendorName": "", "subtotal": 0.0},
+            "billingDetails": {"invoiceNumber": "", "invoiceDate": "", "dueDate": ""},
+            "chargesSummary": {"document_total": 0.0, "secondary_tax": 0.0, "lineItemsBreakdown": []},
+            "status": "conversion_error"
+        }
+
+def pdf_processing_with_remote_api(agent, tool_results, **kwargs):
+    """Custom processing function that switches to remote API for PDF processing"""
+    print("\n[DEBUG] pdf_processing_with_remote_api function called!")
+    print(f"[DEBUG] Agent: {agent.role}")
+    print(f"[DEBUG] Tool results keys: {list(tool_results.keys())}")
+    import json
+    original_url = switch_to_remote_api_for_pdf()
+    try:
+        for tool_name, result_data in tool_results.items():
+            if tool_name == "PDFProcessor":
+                print("\n[DEBUG] Full PDFProcessor result_data:")
+                try:
+                    print(json.dumps(result_data, indent=2, default=str))
+                except Exception as e:
+                    print(f"[DEBUG] Could not serialize result_data: {e}")
+                    print(result_data)
+            if tool_name == "PDFProcessor" and result_data.get("success"):
+                data = result_data.get("data", {})
+                # Double-nested
+                if "data" in data and isinstance(data["data"], dict):
+                    inner_data = data["data"]
+                    if "data" in inner_data and isinstance(inner_data["data"], dict):
+                        actual_data = inner_data["data"]
+                        extracted = actual_data.get("extracted_data", {})
+                        vision_response = actual_data.get("vision_response")
+                        if vision_response and (not extracted or not extracted.get("headerSection")):
+                            converted_data = convert_vision_response_to_extracted_data(vision_response)
+                            actual_data["extracted_data"] = converted_data
+                            print(f"\n🔄 [PATCHED] Applied field mapping conversion to {tool_name} (double-nested)")
+                            print(f" Invoice #: {converted_data['billingDetails']['invoiceNumber']}")
+                            print(f" Amount: ${converted_data['chargesSummary']['document_total']}")
+                        # Always print the raw JSON response
+                        if vision_response:
+                            print("\n📝 [AGENT 3] Vision Model Raw JSON Response:")
+                            try:
+                                parsed = json.loads(vision_response.replace('```json','').replace('```','').strip())
+                                print(json.dumps(parsed, indent=2))
+                            except Exception:
+                                print(vision_response)
+                    else:
+                        extracted = inner_data.get("extracted_data", {})
+                        vision_response = inner_data.get("vision_response")
+                        if vision_response and (not extracted or not extracted.get("headerSection")):
+                            converted_data = convert_vision_response_to_extracted_data(vision_response)
+                            inner_data["extracted_data"] = converted_data
+                            print(f"\n🔄 [PATCHED] Applied field mapping conversion to {tool_name} (single-nested)")
+                            print(f" Invoice #: {converted_data['billingDetails']['invoiceNumber']}")
+                            print(f" Amount: ${converted_data['chargesSummary']['document_total']}")
+                        if vision_response:
+                            print("\n📝 [AGENT 3] Vision Model Raw JSON Response:")
+                            try:
+                                parsed = json.loads(vision_response.replace('```json','').replace('```','').strip())
+                                print(json.dumps(parsed, indent=2))
+                            except Exception:
+                                print(vision_response)
+                else:
+                    extracted = data.get("extracted_data", {})
+                    vision_response = data.get("vision_response")
+                    if vision_response and (not extracted or not extracted.get("headerSection")):
+                        converted_data = convert_vision_response_to_extracted_data(vision_response)
+                        data["extracted_data"] = converted_data
+                        print(f"\n🔄 [PATCHED] Applied field mapping conversion to {tool_name} (direct)")
+                        print(f" Invoice #: {converted_data['billingDetails']['invoiceNumber']}")
+                        print(f" Amount: ${converted_data['chargesSummary']['document_total']}")
+                    if vision_response:
+                        print("\n📝 [AGENT 3] Vision Model Raw JSON Response:")
+                        try:
+                            parsed = json.loads(vision_response.replace('```json','').replace('```','').strip())
+                            print(json.dumps(parsed, indent=2))
+                        except Exception:
+                            print(vision_response)
+        print_vision_model_data(agent, tool_results)
+        return tool_results
+    finally:
+        restore_api_url(original_url)
+
+def fix_pdfprocessor_response(agent, result_data, **kwargs):
+    """Custom processing function that calls remote API for PDF processing and prints JSON"""
+    print(f"\n[DEBUG] fix_pdfprocessor_response called for {agent.role}")
+    print(f"[DEBUG] Result data type: {type(result_data)}")
+    print(f"[DEBUG] Result data: {result_data}")
+
+    # Get the file path from the result_data (the execution engine passes input data here)
+    file_path = result_data.get('file', '')
+    print(f"[DEBUG] File path from result_data: {file_path}")
+
+    if not file_path:
+        print("❌ No file path provided in result_data")
+        print(f"[DEBUG] Available keys in result_data: {list(result_data.keys())}")
+        return result_data
+
+    try:
+        import requests
+        import json
+        import os
+        import base64
+
+        # Use the remote API for PDF processing
+        api_url = "https://api.memra.co"
+        api_key = os.getenv("MEMRA_API_KEY", "test-secret-for-development")
+
+        # Since the file is already uploaded and we have the remote path, use it directly
+        print(f"🔍 Using remote file path: {file_path}")
+
+        # Call the PDFProcessor with the remote path
+        print(f"🔍 Calling PDFProcessor with remote path...")
+
+        pdf_data = {
+            "file_path": file_path
+        }
+
+        response = requests.post(
+            f"{api_url}/tools/execute",
+            json={
+                "tool_name": "PDFProcessor",
+                "parameters": pdf_data
+            },
+            headers={
+                "X-API-Key": api_key,
+                "Content-Type": "application/json"
+            }
+        )
+
+        if response.status_code != 200:
+            print(f"❌ PDFProcessor call failed: {response.status_code}")
+            print(f" Response: {response.text}")
+            return result_data
+
+        pdf_result = response.json()
+        print(f"\n🎯 AGENT 3 - FULL PDFPROCESSOR RESPONSE:")
+        print("=" * 60)
+        print(json.dumps(pdf_result, indent=2, default=str))
+        print("=" * 60)
+
+        # Extract the vision response from the nested structure
+        vision_response = None
+        if pdf_result.get("success") and "data" in pdf_result:
+            data = pdf_result["data"]
+
+            # Check for nested data structure
+            if isinstance(data, dict) and "data" in data:
+                actual_data = data["data"]
+                if "vision_response" in actual_data:
+                    vision_response = actual_data["vision_response"]
+            elif "vision_response" in data:
+                vision_response = data["vision_response"]
+
+        if vision_response:
+            print(f"\n🎯 AGENT 3 - RAW VISION MODEL JSON:")
+            print("=" * 60)
+            print(vision_response)
+            print("=" * 60)
+
+            # Try to parse the JSON response
+            try:
+                # Clean up the response - remove markdown code blocks if present
+                cleaned_response = vision_response
+                if cleaned_response.startswith("```json"):
+                    cleaned_response = cleaned_response.replace("```json", "").replace("```", "").strip()
+                elif cleaned_response.startswith("```"):
+                    cleaned_response = cleaned_response.replace("```", "").strip()
+
+                parsed_data = json.loads(cleaned_response)
+                print(f"\n✅ [AGENT 3] Successfully parsed JSON:")
+                print(json.dumps(parsed_data, indent=2))
+
+                # Convert to the expected format
+                extracted_data = convert_vision_response_to_extracted_data(cleaned_response)
+
+                # Debug vendor extraction
+                print(f"\n🔍 [AGENT 3] Extracted vendor: '{extracted_data['headerSection']['vendorName']}'")
+                print(f" Invoice #: {extracted_data['billingDetails']['invoiceNumber']}")
+                print(f" Amount: ${extracted_data['chargesSummary']['document_total']}")
+
+                # Update the result_data
+                result_data = {
+                    "success": True,
+                    "data": {
+                        "vision_response": vision_response,
+                        "extracted_data": extracted_data
+                    },
+                    "_memra_metadata": {
+                        "agent_role": agent.role,
+                        "tools_real_work": ["PDFProcessor"],
+                        "tools_mock_work": [],
+                        "work_quality": "real"
+                    }
+                }
+
+                return result_data
+
+            except json.JSONDecodeError as e:
+                print(f"❌ JSON parsing error: {e}")
+                print(f"Raw response: {vision_response}")
+                return result_data
+        else:
+            print(f"❌ No vision_response found in PDFProcessor result")
+            return result_data
+
+    except Exception as e:
+        print(f"❌ Error in PDF processing: {e}")
+        return result_data
+
+def direct_vision_processing(agent, result_data, **kwargs):
+    """Direct vision model processing without using tools with retry logic"""
+    print(f"\n[DEBUG] direct_vision_processing called for {agent.role}")
+    print(f"[DEBUG] Result data type: {type(result_data)}")
+    print(f"[DEBUG] Result data: {result_data}")
+    print(f"[DEBUG] Kwargs: {kwargs}")
+
+    # Get the file path from the input data - check kwargs['input'] first
+    input_data = kwargs.get('input', {})
+    file_path = input_data.get('file', '') or kwargs.get('file', '') or result_data.get('file', '')
+    print(f"[DEBUG] File path: {file_path}")
+
+    # Get the invoice schema from previous agent results
+    results = kwargs.get('results', {})
+    invoice_schema = results.get('invoice_schema', {})
+    schema_results = invoice_schema.get('results', [])
+    print(f"[DEBUG] Schema fields: {[col['column_name'] for col in schema_results]}")
+
+    if not file_path:
+        print("❌ No file path provided")
+        return result_data
+
+    # Retry logic for vision processing
+    for attempt in range(PROCESSING_CONFIG["max_retries"] + 1):
+        try:
+            import requests
+            import json
+            import os
+            import base64
+
+            # Use the remote API for PDF processing
+            api_url = "https://api.memra.co"
+            api_key = os.getenv("MEMRA_API_KEY", "test-secret-for-development")
+
+            # Check if file is already a remote path
+            if file_path.startswith('/uploads/'):
+                print(f"✅ File already uploaded to remote API: {file_path}")
+                remote_path = file_path
+            else:
+                # Local file - need to upload
+                print(f"📤 Uploading file to remote API (attempt {attempt + 1})...")
+
+                # Read the file and encode as base64
+                with open(file_path, 'rb') as f:
+                    file_content = f.read()
+
+                file_b64 = base64.b64encode(file_content).decode('utf-8')
+
+                # Prepare upload data
+                upload_data = {
+                    "filename": os.path.basename(file_path),
+                    "content": file_b64,
+                    "content_type": "application/pdf"
+                }
+
+                # Upload to remote API with timeout
+                response = requests.post(
+                    f"{api_url}/upload",
+                    json=upload_data,
+                    headers={
+                        "X-API-Key": api_key,
+                        "Content-Type": "application/json"
+                    },
+                    timeout=PROCESSING_CONFIG["timeout_seconds"]
+                )
+
+                if response.status_code != 200:
+                    print(f"❌ Upload failed: {response.status_code}")
+                    print(f" Response: {response.text}")
+
+                    # Check for rate limiting
+                    if response.status_code == 429:
+                        delay = PROCESSING_CONFIG["rate_limit_delay"] * (2 ** attempt)
+                        print(f"⏳ Rate limited, waiting {delay}s before retry...")
+                        time.sleep(delay)
+                        continue
+                    else:
+                        return result_data
+
+                upload_result = response.json()
+                if not upload_result.get("success"):
+                    print(f"❌ Upload failed: {upload_result.get('error')}")
+                    return result_data
+
+                remote_path = upload_result["data"]["remote_path"]
+                print(f"✅ File uploaded successfully")
+                print(f" Remote path: {remote_path}")
+
+            # Now call the PDFProcessor with the remote path
+            print(f"🔍 Calling PDFProcessor with remote path (attempt {attempt + 1})...")
+
+            # Convert schema to format expected by PDFProcessor
+            schema_for_pdf = None
+            if schema_results:
+                # Send the raw schema array - server now handles both formats
+                schema_for_pdf = [
+                    col for col in schema_results
+                    if col["column_name"] not in ["id", "created_at", "updated_at", "status", "raw_json"]
+                ]
+                print(f"📋 Passing schema with {len(schema_for_pdf)} fields to PDFProcessor")
+                print(f"📋 Schema fields: {[c['column_name'] for c in schema_for_pdf]}")
+
+            response = requests.post(
+                f"{api_url}/tools/execute",
+                json={
+                    "tool_name": "PDFProcessor",
+                    "hosted_by": "memra",
+                    "input_data": {
+                        "file": remote_path,
+                        "schema": schema_for_pdf
+                    }
+                },
+                headers={
+                    "X-API-Key": api_key,
+                    "Content-Type": "application/json"
+                },
+                timeout=PROCESSING_CONFIG["timeout_seconds"]
+            )
+
+            if response.status_code != 200:
+                print(f"❌ PDFProcessor call failed: {response.status_code}")
+                print(f" Response: {response.text}")
+
+                # Check for rate limiting
+                if response.status_code == 429:
+                    delay = PROCESSING_CONFIG["rate_limit_delay"] * (2 ** attempt)
+                    print(f"⏳ Rate limited, waiting {delay}s before retry...")
+                    time.sleep(delay)
+                    continue
+                else:
+                    return result_data
+
+            pdf_result = response.json()
+            print(f"\n🎯 AGENT 3 - FULL PDFPROCESSOR RESPONSE:")
+            print("=" * 60)
+            print(json.dumps(pdf_result, indent=2, default=str))
+            print("=" * 60)
+
+            # Extract the vision response from the nested structure
+            vision_response = None
+            if pdf_result.get("success") and "data" in pdf_result:
+                data = pdf_result["data"]
+
+                # Check for nested data structure
+                if isinstance(data, dict) and "data" in data:
+                    actual_data = data["data"]
+                    if "vision_response" in actual_data:
+                        vision_response = actual_data["vision_response"]
+                elif "vision_response" in data:
+                    vision_response = data["vision_response"]
+
+            if vision_response:
+                print(f"\n🎯 AGENT 3 - RAW VISION MODEL JSON:")
+                print("=" * 60)
+                print(vision_response)
+                print("=" * 60)
+
+                # Try to parse the JSON response
+                try:
+                    # Clean up the response - remove markdown code blocks if present
+                    cleaned_response = vision_response
+                    if cleaned_response.startswith("```json"):
+                        cleaned_response = cleaned_response.replace("```json", "").replace("```", "").strip()
+                    elif cleaned_response.startswith("```"):
+                        cleaned_response = cleaned_response.replace("```", "").strip()
+
+                    parsed_data = json.loads(cleaned_response)
+                    print(f"\n✅ [AGENT 3] Successfully parsed JSON:")
+                    print(json.dumps(parsed_data, indent=2))
+
+                    # Convert to the expected format
+                    extracted_data = convert_vision_response_to_extracted_data(cleaned_response)
+
+                    # Debug vendor extraction
+                    print(f"\n🔍 [AGENT 3] Extracted vendor: '{extracted_data['headerSection']['vendorName']}'")
+                    print(f" Invoice #: {extracted_data['billingDetails']['invoiceNumber']}")
+                    print(f" Amount: ${extracted_data['chargesSummary']['document_total']}")
+
+                    # Update the result_data
+                    result_data = {
+                        "success": True,
+                        "data": {
+                            "vision_response": vision_response,
+                            "extracted_data": extracted_data
+                        },
+                        "_memra_metadata": {
+                            "agent_role": agent.role,
+                            "tools_real_work": ["PDFProcessor"],
+                            "tools_mock_work": [],
+                            "work_quality": "real"
+                        }
+                    }
+
+                    return result_data
+
+                except json.JSONDecodeError as e:
+                    print(f"❌ JSON parsing error: {e}")
+                    print(f"Raw response: {vision_response}")
+
+                    # Don't retry on JSON parsing errors
+                    return result_data
+            else:
+                print(f"❌ No vision_response found in PDFProcessor result")
+
+                # Retry if no vision response (might be temporary API issue)
+                if attempt < PROCESSING_CONFIG["max_retries"]:
+                    delay = PROCESSING_CONFIG["retry_delay_base"] * (2 ** attempt)
+                    print(f"⏳ No vision response, waiting {delay}s before retry...")
+                    time.sleep(delay)
+                    continue
+                else:
+                    return result_data
+
+        except requests.exceptions.Timeout:
+            print(f"⏰ Vision processing timeout (attempt {attempt + 1})")
+            if attempt < PROCESSING_CONFIG["max_retries"]:
+                delay = PROCESSING_CONFIG["retry_delay_base"] * (2 ** attempt)
+                print(f"⏳ Waiting {delay}s before retry...")
+                time.sleep(delay)
+                continue
+        except Exception as e:
+            print(f"❌ Error in PDF processing (attempt {attempt + 1}): {e}")
+            if attempt < PROCESSING_CONFIG["max_retries"]:
+                delay = PROCESSING_CONFIG["retry_delay_base"] * (2 ** attempt)
+                print(f"⏳ Waiting {delay}s before retry...")
+                time.sleep(delay)
+                continue
+
+    print(f"❌ Failed to process vision after {PROCESSING_CONFIG['max_retries'] + 1} attempts")
+    return result_data
+
+# Create a new Agent 3 that bypasses the tool system
+direct_vision_agent = Agent(
+    role="Invoice Parser",
+    job="Extract structured data from invoice PDF using vision model",
+    llm=parsing_llm,
+    sops=[
+        "Load invoice PDF file",
+        "Send to vision model for field extraction",
+        "Print out the raw JSON data returned by vision model tools",
+        "Validate extracted data against schema types",
+        "Return structured invoice data"
+    ],
+    systems=["InvoiceStore"],
+    tools=[],  # No tools - we'll do direct API calls in custom processing
+    input_keys=["file", "invoice_schema"],
+    output_key="invoice_data",
+    custom_processing=direct_vision_processing
+)
+
+parser_agent = Agent(
+    role="Invoice Parser",
+    job="Extract structured data from invoice PDF using vision model",
+    llm=parsing_llm,
+    sops=[
+        "Load invoice PDF file",
+        "Send to vision model for field extraction",
+        "Print out the raw JSON data returned by vision model tools",
+        "Validate extracted data against schema types",
+        "Return structured invoice data"
+    ],
+    systems=["InvoiceStore"],
+    tools=[
+        {"name": "PDFProcessor", "hosted_by": "memra", "input_keys": ["file_path"]}
+    ],
+    input_keys=["file", "invoice_schema"],
+    output_key="invoice_data",
+    custom_processing=pdf_processing_with_remote_api
+)
+
+def process_database_insertion(agent, tool_results, **kwargs):
+    """Custom processing for Agent 4 to properly map invoice data to database format"""
+    print(f"\n[DEBUG] process_database_insertion called for {agent.role}")
+
+    # Get the invoice data from kwargs
+    input_data = kwargs.get('input', {})
+    results = kwargs.get('results', {})
+
+    # Try to get invoice_data from various sources
+    invoice_data = (
+        results.get('invoice_data') or
+        input_data.get('invoice_data') or
+        kwargs.get('invoice_data', {})
+    )
+
+    print(f"[DEBUG] Invoice data type: {type(invoice_data)}")
+    print(f"[DEBUG] Invoice data keys: {list(invoice_data.keys()) if isinstance(invoice_data, dict) else 'Not a dict'}")
+
+    # Transform the data for database insertion
+    if isinstance(invoice_data, dict):
+        # Create the properly formatted data for database
+        db_data = {}
+
+        # Check if data is in the new format (headerSection, billingDetails, etc.)
+        if 'headerSection' in invoice_data and 'billingDetails' in invoice_data:
+            header = invoice_data.get('headerSection', {})
+            billing = invoice_data.get('billingDetails', {})
+            charges = invoice_data.get('chargesSummary', {})
+
+            db_data = {
+                'vendor_name': header.get('vendorName', ''),
+                'invoice_number': billing.get('invoiceNumber', ''),
+                'invoice_date': billing.get('invoiceDate', ''),
+                'due_date': billing.get('dueDate', ''),
+                'total_amount': charges.get('document_total', 0),
+                'tax_amount': charges.get('secondary_tax', 0),
+                'line_items': json.dumps(charges.get('lineItemsBreakdown', []))
+            }
+
+            print(f"\n💾 [AGENT 4] Prepared database record:")
+            print(f" vendor_name: '{db_data['vendor_name']}'")
+            print(f" invoice_number: '{db_data['invoice_number']}'")
+            print(f" invoice_date: '{db_data['invoice_date']}'")
+            print(f" total_amount: {db_data['total_amount']}")
+
+        # Check if data is in the old format
+        elif 'extracted_data' in invoice_data:
+            extracted = invoice_data['extracted_data']
+            if isinstance(extracted, dict):
+                if 'headerSection' in extracted:
+                    # Nested new format
+                    header = extracted.get('headerSection', {})
+                    billing = extracted.get('billingDetails', {})
+                    charges = extracted.get('chargesSummary', {})
+
+                    db_data = {
+                        'vendor_name': header.get('vendorName', ''),
+                        'invoice_number': billing.get('invoiceNumber', ''),
+                        'invoice_date': billing.get('invoiceDate', ''),
+                        'due_date': billing.get('dueDate', ''),
+                        'total_amount': charges.get('document_total', 0),
+                        'tax_amount': charges.get('secondary_tax', 0),
+                        'line_items': json.dumps(charges.get('lineItemsBreakdown', []))
+                    }
+                else:
+                    # Old flat format
+                    db_data = {
+                        'vendor_name': extracted.get('vendor_name', ''),
+                        'invoice_number': extracted.get('invoice_number', ''),
+                        'invoice_date': extracted.get('invoice_date', ''),
+                        'due_date': extracted.get('due_date', ''),
+                        'total_amount': extracted.get('amount', extracted.get('total_amount', 0)),
+                        'tax_amount': extracted.get('tax_amount', 0),
+                        'line_items': json.dumps(extracted.get('line_items', []))
+                    }
+
+        # Update tool parameters with the transformed data
+        for tool_name, result in tool_results.items():
+            if tool_name == "PostgresInsert" and db_data:
+                # Inject the properly formatted data into the tool parameters
+                if 'parameters' not in result:
+                    result['parameters'] = {}
+                result['parameters']['data'] = db_data
+                print(f"\n✅ [AGENT 4] Injected transformed data into PostgresInsert parameters")
+
+    # Call the original print function for debugging
+    print_database_data(agent, tool_results, invoice_data)
+
+    return tool_results
+
+writer_agent = Agent(
+    role="Data Entry Specialist",
+    job="Write validated invoice data to database",
+    llm=default_llm,
+    sops=[
+        "Validate invoice data completeness",
+        "Map fields to database columns using schema",
+        "Print out the data being inserted into database",
+        "Connect to database",
+        "Insert record into invoices table",
+        "Return confirmation with record ID"
+    ],
+    systems=["Database"],
+    tools=[
+        {"name": "DataValidator", "hosted_by": "mcp"},
+        {"name": "PostgresInsert", "hosted_by": "mcp"}
+    ],
+    input_keys=["invoice_data", "invoice_schema"],
+    output_key="write_confirmation",
+    custom_processing=process_database_insertion
+)
+
+post_monitor_agent = create_simple_monitor_agent()
+post_monitor_agent.role = "Post-ETL Database Monitor"
+
+manager_agent = Agent(
+    role="ETL Process Manager",
+    job="Coordinate ETL pipeline and validate data integrity",
+    llm=manager_llm,
+    sops=[
+        "Review pre-ETL database state",
+        "Validate ETL process completion",
+        "Compare pre and post database states",
+        "Generate ETL summary report",
+        "Flag any data quality issues"
+    ],
+    allow_delegation=True,
+    output_key="etl_summary"
+)
+
+# Create ETL department
+etl_department = Department(
+    name="ETL Invoice Processing",
+    mission="Complete end-to-end ETL process with comprehensive monitoring",
+    agents=[pre_monitor_agent, etl_agent, direct_vision_agent, writer_agent, post_monitor_agent],
+    manager_agent=manager_agent,
+    workflow_order=[
+        "Pre-ETL Database Monitor",
+        "Data Engineer",
+        "Invoice Parser",
+        "Data Entry Specialist",
+        "Post-ETL Database Monitor"
+    ],
+    dependencies=["Database", "InvoiceStore"],
+    execution_policy={
+        "retry_on_fail": True,
+        "max_retries": 2,
+        "halt_on_validation_error": True,
+        "timeout_seconds": 300
+    },
+    context={
+        "company_id": "acme_corp",
+        "fiscal_year": "2024",
+        "mcp_bridge_url": "http://localhost:8081",
+        "mcp_bridge_secret": "test-secret-for-development"
+    }
+)
+
+def upload_file_to_api(file_path: str, api_url: str = "https://api.memra.co", max_retries: int = 3) -> str:
+    """Upload a file to the remote API for vision-based PDF processing with retry logic"""
+
+    for attempt in range(max_retries + 1):
+        try:
+            print(f"📤 Uploading {os.path.basename(file_path)} to remote API (attempt {attempt + 1}/{max_retries + 1})")
+            print(f" File path: {file_path}")
+
+            # Read the file and encode as base64
+            with open(file_path, 'rb') as f:
+                file_content = f.read()
+
+            file_b64 = base64.b64encode(file_content).decode('utf-8')
+
+            # Prepare upload data
+            upload_data = {
+                "filename": os.path.basename(file_path),
+                "content": file_b64,
+                "content_type": "application/pdf"
+            }
+
+            # Upload to remote API
+            api_key = os.getenv("MEMRA_API_KEY")
+            response = requests.post(
+                f"{api_url}/upload",
+                json=upload_data,
+                headers={
+                    "X-API-Key": api_key,
+                    "Content-Type": "application/json"
+                },
+                timeout=PROCESSING_CONFIG["timeout_seconds"]
+            )
+
+            if response.status_code == 200:
+                result = response.json()
+                if result.get("success"):
+                    remote_path = result["data"]["remote_path"]
+                    print(f"✅ File uploaded successfully")
+                    print(f" Remote path: {remote_path}")
+                    return remote_path
+                else:
+                    error_msg = result.get('error', 'Unknown error')
+                    print(f"❌ Upload failed: {error_msg}")
+
+                    # Check if it's a rate limit error
+                    if "rate limit" in error_msg.lower() or "too many requests" in error_msg.lower():
+                        delay = PROCESSING_CONFIG["rate_limit_delay"] * (2 ** attempt)
+                        print(f"⏳ Rate limited, waiting {delay}s before retry...")
+                        time.sleep(delay)
+                        continue
+            elif response.status_code == 429:  # Rate limited
+                delay = PROCESSING_CONFIG["rate_limit_delay"] * (2 ** attempt)
+                print(f"⏳ Rate limited (HTTP 429), waiting {delay}s before retry...")
+                time.sleep(delay)
+                continue
+            else:
+                print(f"❌ Upload request failed: {response.status_code}")
+                print(f" Response: {response.text}")
+
+                # Don't retry on client errors (4xx) except 429
+                if 400 <= response.status_code < 500 and response.status_code != 429:
+                    break
+
+        except requests.exceptions.Timeout:
+            print(f"⏰ Upload timeout (attempt {attempt + 1})")
+            if attempt < max_retries:
+                delay = PROCESSING_CONFIG["retry_delay_base"] * (2 ** attempt)
+                print(f"⏳ Waiting {delay}s before retry...")
+                time.sleep(delay)
+                continue
+        except Exception as e:
+            print(f"⚠️ Upload error (attempt {attempt + 1}): {e}")
+            if attempt < max_retries:
+                delay = PROCESSING_CONFIG["retry_delay_base"] * (2 ** attempt)
+                print(f"⏳ Waiting {delay}s before retry...")
+                time.sleep(delay)
+                continue
+
+    print(f"❌ Failed to upload {os.path.basename(file_path)} after {max_retries + 1} attempts")
+    return file_path
+
+def print_vision_model_data(agent, tool_results):
+    """Print out the JSON data returned by vision model tools"""
+    print(f"\n🔍 {agent.role}: VISION MODEL DATA ANALYSIS")
+    print("=" * 60)
+
+    for tool_name, result in tool_results.items():
+        print(f"\n📊 Tool: {tool_name}")
+        print(f"✅ Success: {result.get('success', 'Unknown')}")
+
+        if 'data' in result:
+            data = result['data']
+            print(f"📄 Data Structure:")
+            print(f" - Keys: {list(data.keys())}")
+
+            # Print extracted text if available
+            if 'extracted_text' in data:
+                text = data['extracted_text']
+                print(f"📝 Extracted Text ({len(text)} chars):")
+                print(f" {text[:200]}{'...' if len(text) > 200 else ''}")
+
+            # Print extracted data if available
+            if 'extracted_data' in data:
+                extracted = data['extracted_data']
+                print(f"🎯 Extracted Data:")
+
+                # Handle both old and new formats
+                if 'headerSection' in extracted:
+                    # New format (converted)
+                    header = extracted.get('headerSection', {})
+                    billing = extracted.get('billingDetails', {})
+                    charges = extracted.get('chargesSummary', {})
+                    print(f" Vendor: {header.get('vendorName', 'N/A')}")
+                    print(f" Invoice #: {billing.get('invoiceNumber', 'N/A')}")
+                    print(f" Date: {billing.get('invoiceDate', 'N/A')}")
+                    print(f" Amount: ${charges.get('document_total', 'N/A')}")
+                    print(f" Tax: ${charges.get('secondary_tax', 'N/A')}")
+                    print(f" Line Items: {len(charges.get('lineItemsBreakdown', []))} items")
+                else:
+                    # Old format (direct)
+                    print(f" Vendor: {extracted.get('vendor_name', 'N/A')}")
+                    print(f" Invoice #: {extracted.get('invoice_number', 'N/A')}")
+                    print(f" Date: {extracted.get('invoice_date', 'N/A')}")
+                    print(f" Amount: ${extracted.get('amount', 'N/A')}")
+                    print(f" Tax: ${extracted.get('tax_amount', 'N/A')}")
+                    print(f" Line Items: {extracted.get('line_items', 'N/A')}")
+
+            # Print screenshot info if available
+            if 'screenshots_dir' in data:
+                print(f"📸 Screenshots:")
+                print(f" Directory: {data.get('screenshots_dir', 'N/A')}")
+                print(f" Count: {data.get('screenshot_count', 'N/A')}")
+                print(f" Invoice ID: {data.get('invoice_id', 'N/A')}")
+
+        if 'error' in result:
+            print(f"❌ Error: {result['error']}")
+
+    print("=" * 60)
+
+def print_database_data(agent, tool_results, invoice_data):
+    """Print out the data being inserted into database"""
+    print(f"\n💾 {agent.role}: DATABASE INSERTION DATA")
+    print("=" * 60)
+
+    if invoice_data:
+        print(f"📊 Invoice Data to Insert:")
+        if isinstance(invoice_data, dict) and 'extracted_data' in invoice_data:
+            data = invoice_data['extracted_data']
+            print(f" Vendor: '{data.get('vendor_name', '')}'")
+            print(f" Invoice #: '{data.get('invoice_number', '')}'")
+            print(f" Date: '{data.get('invoice_date', '')}'")
+            print(f" Amount: {data.get('amount', 0)}")
+            print(f" Tax: {data.get('tax_amount', 0)}")
+            print(f" Line Items: '{data.get('line_items', '')}'")
+        else:
+            print(f" Raw data: {invoice_data}")
+
+    for tool_name, result in tool_results.items():
+        print(f"\n🔧 Tool: {tool_name}")
+        print(f"✅ Success: {result.get('success', 'Unknown')}")
+
+        if 'data' in result:
+            data = result['data']
+            print(f"📄 Result Data:")
+            for key, value in data.items():
+                print(f" {key}: {value}")
+
+        if 'error' in result:
+            print(f"❌ Error: {result['error']}")
+
+    print("=" * 60)
+
+def switch_to_remote_api_for_pdf():
+    """Temporarily switch to remote API for PDF processing"""
+    original_url = os.environ.get("MEMRA_API_URL")
+    os.environ["MEMRA_API_URL"] = REMOTE_API_URL
+    return original_url
+
+def restore_api_url(original_url):
+    """Restore the original API URL"""
+    if original_url:
+        os.environ["MEMRA_API_URL"] = original_url
+
+def validate_agent_configuration(department):
+    """Validate that critical agents have required tools configured"""
+    critical_agents = {
+        "Invoice Parser": ["PDFProcessor"],
+        "Data Entry Specialist": ["DataValidator", "PostgresInsert"],
+        "Data Engineer": ["SQLExecutor"]
+    }
+
+    for agent in department.agents:
+        if agent.role in critical_agents:
+            # Skip validation if agent has custom processing function
+            if hasattr(agent, 'custom_processing') and agent.custom_processing is not None:
+                print(f"ℹ️ {agent.role} uses custom processing (tools validation skipped)")
+                continue
+
+            required_tools = critical_agents[agent.role]
+            # Handle both Tool objects and dictionaries
+            configured_tools = []
+            for tool in agent.tools:
+                if isinstance(tool, dict):
+                    configured_tools.append(tool["name"])
+                else:
+                    configured_tools.append(tool.name)
+
+            missing_tools = [tool for tool in required_tools if tool not in configured_tools]
+            if missing_tools:
+                print(f"⚠️ WARNING: {agent.role} is missing critical tools: {missing_tools}")
+                print(f" Configured tools: {configured_tools}")
+                return False
+
+    return True
+
+def main():
+    """Run the ETL demo workflow with robust processing"""
+    print("\n🚀 Starting ETL Invoice Processing Demo...")
+    print("📊 This demo includes comprehensive database monitoring")
+    print("📡 Tools will execute on Memra API server")
+    print("📝 Processing 15 specific invoice files with robust error handling")
+    print("⏱️ Includes delays between files and retry logic for API resilience")
+    print("🎯 Target files:", ", ".join(TARGET_FILES))
+
+    # Configuration
+    config = {
+        "table_name": os.getenv("MEMRA_TABLE_NAME", "invoices"),
+        "data_directory": os.getenv("MEMRA_DATA_DIR", "data/invoices"),
+        "company_id": os.getenv("MEMRA_COMPANY_ID", "acme_corp"),
+        "fiscal_year": os.getenv("MEMRA_FISCAL_YEAR", "2024"),
+        "database_url": os.getenv("MEMRA_DATABASE_URL", "postgresql://memra:memra123@localhost:5432/memra_invoice_db")
+    }
+
+    # Generate schema query dynamically
+    schema_query = f"SELECT column_name, data_type, is_nullable, column_default FROM information_schema.columns WHERE table_name = '{config['table_name']}' ORDER BY ordinal_position"
+
+    # Validate agent configuration before proceeding
+    if not validate_agent_configuration(etl_department):
+        print("❌ Critical agents are missing required tools!")
+        print("⚠️ Please fix agent configuration before running ETL process")
+        sys.exit(1)
+
+    engine = ExecutionEngine()
+
+    # Use configurable data directory
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    data_dir = os.path.join(current_dir, config["data_directory"])
+
+    # Find only the target files
+    invoice_files = []
+    missing_files = []
+
+    for target_file in TARGET_FILES:
+        file_path = os.path.join(data_dir, target_file)
+        if os.path.exists(file_path):
+            invoice_files.append(file_path)
+        else:
+            missing_files.append(target_file)
+
+    if missing_files:
+        print(f"⚠️ Missing files: {', '.join(missing_files)}")
+
+    if not invoice_files:
+        print(f"❌ No target files found in {config['data_directory']}/ directory")
+        print("📝 Available files:")
+        available_files = glob.glob(os.path.join(data_dir, "*.PDF"))
+        for file in available_files[:10]:  # Show first 10
+            print(f" - {os.path.basename(file)}")
+        if len(available_files) > 10:
+            print(f" ... and {len(available_files) - 10} more")
+        sys.exit(1)
+
+    print(f"\n📁 Found {len(invoice_files)} target files to process")
+    print(f"⏱️ Estimated processing time: {len(invoice_files) * PROCESSING_CONFIG['delay_between_files']:.1f} seconds (plus processing time)")
+
+    # Process files with robust error handling
+    successful_processing = 0
+    failed_processing = 0
+    skipped_processing = 0
+
+    for idx, invoice_file in enumerate(invoice_files):
+        filename = os.path.basename(invoice_file)
+        print(f"\n{'='*60}")
+        print(f"📄 Processing file {idx + 1}/{len(invoice_files)}: {filename}")
+        print(f"{'='*60}")
+
+        # Add delay between files (except for the first one)
+        if idx > 0:
+            delay = PROCESSING_CONFIG["delay_between_files"] + random.uniform(0, 1)  # Add some randomness
+            print(f"⏳ Waiting {delay:.1f}s between files...")
+            time.sleep(delay)
+
+        try:
+            # Upload file with retry logic
+            remote_file_path = upload_file_to_api(invoice_file, max_retries=PROCESSING_CONFIG["max_retries"])
+
+            if remote_file_path == invoice_file:
+                print(f"❌ Failed to upload {filename}, skipping...")
+                failed_processing += 1
+                continue
+
+            # Run the full ETL workflow with configurable parameters
+            input_data = {
+                "file": remote_file_path,
+                "connection": config["database_url"],
+                "table_name": config["table_name"],
+                "sql_query": schema_query
+            }
+
+            result = engine.execute_department(etl_department, input_data)
+
+            if result.success:
+                successful_processing += 1
+                print(f"\n✅ Successfully processed: {filename}")
+
+                # Show summary if available
+                if 'etl_summary' in result.data:
+                    summary = result.data['etl_summary']
+                    print(f"📋 Status: {summary.get('status', 'success')}")
+                if 'write_confirmation' in result.data:
+                    write_conf = result.data['write_confirmation']
+                    if isinstance(write_conf, dict) and 'record_id' in write_conf:
+                        print(f"💾 Database Record ID: {write_conf['record_id']}")
+            else:
+                failed_processing += 1
+                print(f"\n❌ Failed to process: {filename}")
+                print(f" Error: {result.error}")
+                if result.trace and result.trace.errors:
+                    print(" Details:")
+                    for error in result.trace.errors:
+                        print(f" - {error}")
+
+        except Exception as e:
+            failed_processing += 1
+            print(f"\n💥 Unexpected error processing {filename}: {e}")
+            print(" Continuing with next file...")
+            continue
+
+    # Final summary
+    print(f"\n{'='*60}")
+    print(f"🎯 ETL DEMO COMPLETED")
+    print(f"{'='*60}")
+    print(f"📊 Processing Summary:")
+    print(f" ✅ Successful: {successful_processing}")
+    print(f" ❌ Failed: {failed_processing}")
+    print(f" ⏭️ Skipped: {skipped_processing}")
+    print(f" 📄 Total: {len(invoice_files)}")
+
+    if successful_processing > 0:
+        print(f"\n🎉 Demo completed successfully!")
+        print(f" Processed {successful_processing} invoices with robust error handling")
+        print(f" This demonstrates real-world API resilience and rate limiting")
+    else:
+        print(f"\n⚠️ No files were processed successfully")
+        print(f" Check API connectivity and file availability")
+
+    print(f"\n💡 This demo shows realistic production scenarios:")
+    print(f" - API rate limiting and retry logic")
+    print(f" - Graceful error handling and file skipping")
+    print(f" - Delays between requests to avoid overwhelming APIs")
+    print(f" - Exponential backoff for failed requests")
+
+if __name__ == "__main__":
+    main()
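
Editor's note (not part of the released package): the retry loops in the demo above all follow the same pattern, an exponential backoff derived from PROCESSING_CONFIG, with a larger base delay when the API answers HTTP 429. Below is a minimal, self-contained sketch of that pattern for readers skimming the diff. The cap via retry_delay_max is an assumption on my part (the config defines it, but the inline loops shown above compute the delay without explicitly applying it), and backoff_delay is a hypothetical helper name, not a function from the memra package.

# Mirrors the constants defined in PROCESSING_CONFIG in the diff above.
PROCESSING_CONFIG = {
    "max_retries": 3,
    "retry_delay_base": 2,   # seconds
    "retry_delay_max": 30,   # seconds; applying this cap is an assumption
    "rate_limit_delay": 5,   # larger base when the API returns HTTP 429
}

def backoff_delay(attempt: int, rate_limited: bool = False) -> float:
    """Hypothetical helper: delay before retry number `attempt` (0-based)."""
    base = PROCESSING_CONFIG["rate_limit_delay"] if rate_limited else PROCESSING_CONFIG["retry_delay_base"]
    return min(base * (2 ** attempt), PROCESSING_CONFIG["retry_delay_max"])

if __name__ == "__main__":
    # Attempts 0..3 -> 2s, 4s, 8s, 16s normally, or 5s, 10s, 20s, 30s (capped) when rate limited.
    for attempt in range(PROCESSING_CONFIG["max_retries"] + 1):
        print(f"attempt {attempt}: wait {backoff_delay(attempt)}s "
              f"(rate limited: {backoff_delay(attempt, rate_limited=True)}s)")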