memra 0.2.11__py3-none-any.whl → 0.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memra/__init__.py +1 -1
- memra/cli.py +129 -9
- memra/demos/etl_invoice_processing/check_after_workflow.py +50 -0
- memra/demos/etl_invoice_processing/check_database.py +44 -0
- memra/demos/etl_invoice_processing/check_recent_db.py +42 -0
- memra/demos/etl_invoice_processing/data/README.md +112 -0
- memra/demos/etl_invoice_processing/data/invoices/10352259401.PDF +0 -0
- memra/demos/etl_invoice_processing/data/invoices/10352259823.PDF +0 -0
- memra/demos/etl_invoice_processing/data/invoices/10352260169.PDF +0 -0
- memra/demos/etl_invoice_processing/data/invoices/10352260417.PDF +0 -0
- memra/demos/etl_invoice_processing/data/invoices/10352260599.PDF +0 -0
- memra/demos/etl_invoice_processing/data/invoices/10352260912.PDF +0 -0
- memra/demos/etl_invoice_processing/data/invoices/10352261134.PDF +0 -0
- memra/demos/etl_invoice_processing/data/invoices/10352261563.PDF +0 -0
- memra/demos/etl_invoice_processing/data/invoices/10352261647.PDF +0 -0
- memra/demos/etl_invoice_processing/data/invoices/10352261720.PDF +0 -0
- memra/demos/etl_invoice_processing/data/invoices/10352261811.PDF +0 -0
- memra/demos/etl_invoice_processing/data/invoices/10352262025.PDF +0 -0
- memra/demos/etl_invoice_processing/data/invoices/10352262454.PDF +0 -0
- memra/demos/etl_invoice_processing/data/invoices/10352262702.PDF +0 -0
- memra/demos/etl_invoice_processing/data/invoices/10352262884.PDF +0 -0
- memra/demos/etl_invoice_processing/data/invoices/10352263346.PDF +0 -0
- memra/demos/etl_invoice_processing/data/invoices/10352263429.PDF +0 -0
- memra/demos/etl_invoice_processing/database_monitor_agent.py +89 -0
- memra/demos/etl_invoice_processing/debug_mcp.py +66 -0
- memra/demos/etl_invoice_processing/debug_schema.py +45 -0
- memra/demos/etl_invoice_processing/etl_invoice_demo.py +1233 -0
- memra/demos/etl_invoice_processing/modify_database.py +65 -0
- memra/demos/etl_invoice_processing/run_etl_batch.py +60 -0
- memra/demos/etl_invoice_processing/setup_demo_data.py +154 -0
- memra/demos/etl_invoice_processing/simple_pdf_processor.py +181 -0
- memra/demos/etl_invoice_processing/test_agent3.py +56 -0
- memra/demos/etl_invoice_processing/test_agent3_v2.py +32 -0
- memra/demos/etl_invoice_processing/test_api.py +28 -0
- memra/demos/etl_invoice_processing/test_api_client_direct.py +89 -0
- memra/demos/etl_invoice_processing/test_conversion.py +172 -0
- memra/demos/etl_invoice_processing/test_debug.py +41 -0
- memra/demos/etl_invoice_processing/test_direct_vision.py +114 -0
- memra/demos/etl_invoice_processing/test_full_response.py +22 -0
- memra/demos/etl_invoice_processing/test_memra_response.py +124 -0
- memra/demos/etl_invoice_processing/test_pdf_processor_response.py +118 -0
- memra/demos/etl_invoice_processing/test_pdfprocessor_direct.py +96 -0
- memra/demos/etl_invoice_processing/test_postgres_insert.py +120 -0
- memra/demos/etl_invoice_processing/test_remote_upload.py +143 -0
- memra/demos/etl_invoice_processing/test_schema_format.py +39 -0
- memra/demos/etl_invoice_processing/test_sql_executor.py +58 -0
- memra/demos/etl_invoice_processing/test_sql_executor_extra_fields.py +61 -0
- memra/demos/etl_invoice_processing/test_sql_executor_fix.py +40 -0
- memra/demos/etl_invoice_processing/test_updated_server.py +50 -0
- memra/demos/etl_invoice_processing/test_upload_functionality.py +156 -0
- memra/demos/etl_invoice_processing/test_upload_server.py +232 -0
- memra/demos/etl_invoice_processing/test_vision_output.py +75 -0
- memra/demos/etl_invoice_processing/test_vision_prompt.py +43 -0
- memra/demos/etl_invoice_processing/test_vision_simple.py +60 -0
- {memra-0.2.11.dist-info → memra-0.2.13.dist-info}/METADATA +53 -78
- memra-0.2.13.dist-info/RECORD +120 -0
- {memra-0.2.11.dist-info → memra-0.2.13.dist-info}/WHEEL +1 -1
- memra-0.2.11.dist-info/RECORD +0 -68
- {memra-0.2.11.dist-info/licenses → memra-0.2.13.dist-info}/LICENSE +0 -0
- {memra-0.2.11.dist-info → memra-0.2.13.dist-info}/entry_points.txt +0 -0
- {memra-0.2.11.dist-info → memra-0.2.13.dist-info}/top_level.txt +0 -0
memra/demos/etl_invoice_processing/etl_invoice_demo.py
@@ -0,0 +1,1233 @@
#!/usr/bin/env python3
"""
ETL Invoice Processing Demo
Complete ETL workflow with database monitoring before and after
"""

import os
import sys
import time
import random
from pathlib import Path
from memra import Agent, Department, LLM, check_api_health, get_api_status
from memra.execution import ExecutionEngine, ExecutionTrace
from database_monitor_agent import create_simple_monitor_agent, get_monitoring_queries
import glob
import requests
import base64
import json

# Set API key for authentication
os.environ['MEMRA_API_KEY'] = 'test-secret-for-development'
os.environ['MEMRA_API_URL'] = 'https://api.memra.co'

# Add the parent directory to the path so we can import memra
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

# Check for required API key
if not os.getenv("MEMRA_API_KEY"):
    print("❌ Error: MEMRA_API_KEY environment variable is required")
    print("Please set your API key: export MEMRA_API_KEY='test-secret-for-development'")
    print("Using local MCP bridge server")
    sys.exit(1)

# Set API configuration - using remote API for all operations including PDF processing
os.environ["MEMRA_API_URL"] = "https://api.memra.co"

# Store the remote API URL for PDF processing
REMOTE_API_URL = "https://api.memra.co"

# Define the specific 15 files to process
TARGET_FILES = [
    "10352259401.PDF",
    "10352259823.PDF",
    "10352260169.PDF",
    "10352260417.PDF",
    "10352260599.PDF",
    "10352260912.PDF",
    "10352261134.PDF",
    "10352261563.PDF",
    "10352261647.PDF",
    "10352261720.PDF",
    "10352261811.PDF",
    "10352262025.PDF",
    "10352262454.PDF",
    "10352262702.PDF",
    "10352262884.PDF"
]

# Configuration for robust processing
PROCESSING_CONFIG = {
    "delay_between_files": 2.5,  # seconds
    "max_retries": 3,
    "retry_delay_base": 2,  # seconds
    "retry_delay_max": 30,  # seconds
    "timeout_seconds": 120,
    "rate_limit_delay": 5  # additional delay if rate limited
}

# Check API health before starting
print("🔍 Checking Memra API status...")
api_status = get_api_status()
print(f"API Health: {'✅ Healthy' if api_status['api_healthy'] else '❌ Unavailable'}")
print(f"API URL: {api_status['api_url']}")
print(f"Tools Available: {api_status['tools_available']}")

if not api_status['api_healthy']:
    print("❌ Cannot proceed - Memra API is not available")
    sys.exit(1)

# Define LLMs
default_llm = LLM(
    model="llama-3.2-11b-vision-preview",
    temperature=0.1,
    max_tokens=2000
)

parsing_llm = LLM(
    model="llama-3.2-11b-vision-preview",
    temperature=0.0,
    max_tokens=4000
)

manager_llm = LLM(
    model="llama-3.2-11b-vision-preview",
    temperature=0.2,
    max_tokens=1500
)

# Define agents
pre_monitor_agent = create_simple_monitor_agent()
pre_monitor_agent.role = "Pre-ETL Database Monitor"

etl_agent = Agent(
    role="Data Engineer",
    job="Extract invoice schema from database",
    llm=default_llm,
    sops=[
        "Connect to database using provided connection string",
        "Generate SQL query: SELECT column_name, data_type, is_nullable, column_default FROM information_schema.columns WHERE table_name = 'invoices' ORDER BY ordinal_position",
        "Execute the generated SQL query using SQLExecutor tool",
        "Extract column names, types, and constraints from results",
        "Return schema as structured JSON with column information"
    ],
    systems=["Database"],
    tools=[
        {"name": "SQLExecutor", "hosted_by": "mcp", "input_keys": ["sql_query"]}
    ],
    input_keys=["connection", "table_name", "sql_query"],
    output_key="invoice_schema"
)

def convert_vision_response_to_extracted_data(vision_response: str) -> dict:
    """Convert vision model response to extracted_data format"""
    try:
        # Clean up the response - remove markdown code blocks if present
        if vision_response.startswith("```json"):
            vision_response = vision_response.replace("```json", "").replace("```", "").strip()

        # Parse the JSON response
        data = json.loads(vision_response)

        # Extract fields with fallback to different naming conventions
        invoice_number = (
            data.get("invoice_number") or
            data.get("InvoiceNumber") or
            data.get("invoiceNumber") or
            ""
        )

        invoice_date = (
            data.get("invoice_date") or
            data.get("InvoiceDate") or
            data.get("invoiceDate") or
            ""
        )

        # Convert date format if needed
        if invoice_date:
            if "/" in invoice_date and len(invoice_date.split("/")) == 3:
                parts = invoice_date.split("/")
                month, day, year = parts[0], parts[1], parts[2]
                if len(year) == 2:
                    year = "20" + year
                invoice_date = f"{year}-{month.zfill(2)}-{day.zfill(2)}"

        amount = (
            data.get("total_amount") or  # Add this - matches vision model output
            data.get("amount") or
            data.get("InvoiceTotal") or
            data.get("invoiceTotal") or
            data.get("total") or
            0
        )

        vendor_name = (
            data.get("vendor_name") or
            data.get("VendorName") or
            data.get("vendorName") or
            data.get("Company") or
            data.get("company") or
            data.get("Vendor") or
            data.get("vendor") or
            ""
        )

        # If vendor not found, try to infer from the data
        if not vendor_name:
            # Check if items mention specific vendors
            items = data.get("Items") or data.get("Order") or data.get("items") or []
            for item in items:
                desc = item.get("Description", "").upper()
                if "PROPANE" in desc:
                    vendor_name = "Superior Propane"
                    break

        tax_amount = (
            data.get("tax_amount") or
            data.get("TaxAmount") or
            data.get("taxAmount") or
            0
        )

        due_date = (
            data.get("due_date") or
            data.get("DueDate") or
            data.get("dueDate") or
            ""
        )

        line_items = (
            data.get("line_items") or
            data.get("Order") or
            data.get("order") or
            data.get("LineItems") or
            data.get("lineItems") or
            []
        )

        # Convert to expected format
        extracted_data = {
            "headerSection": {
                "vendorName": vendor_name,
                "subtotal": float(amount)
            },
            "billingDetails": {
                "invoiceNumber": invoice_number,
                "invoiceDate": invoice_date,
                "dueDate": due_date
            },
            "chargesSummary": {
                "document_total": float(amount),
                "secondary_tax": float(tax_amount),
                "lineItemsBreakdown": line_items
            },
            "status": "processed"
        }

        return extracted_data

    except Exception as e:
        print(f"⚠️ Error converting vision response: {e}")
        return {
            "headerSection": {"vendorName": "", "subtotal": 0.0},
            "billingDetails": {"invoiceNumber": "", "invoiceDate": "", "dueDate": ""},
            "chargesSummary": {"document_total": 0.0, "secondary_tax": 0.0, "lineItemsBreakdown": []},
            "status": "conversion_error"
        }

def pdf_processing_with_remote_api(agent, tool_results, **kwargs):
    """Custom processing function that switches to remote API for PDF processing"""
    print("\n[DEBUG] pdf_processing_with_remote_api function called!")
    print(f"[DEBUG] Agent: {agent.role}")
    print(f"[DEBUG] Tool results keys: {list(tool_results.keys())}")
    import json
    original_url = switch_to_remote_api_for_pdf()
    try:
        for tool_name, result_data in tool_results.items():
            if tool_name == "PDFProcessor":
                print("\n[DEBUG] Full PDFProcessor result_data:")
                try:
                    print(json.dumps(result_data, indent=2, default=str))
                except Exception as e:
                    print(f"[DEBUG] Could not serialize result_data: {e}")
                    print(result_data)
            if tool_name == "PDFProcessor" and result_data.get("success"):
                data = result_data.get("data", {})
                # Double-nested
                if "data" in data and isinstance(data["data"], dict):
                    inner_data = data["data"]
                    if "data" in inner_data and isinstance(inner_data["data"], dict):
                        actual_data = inner_data["data"]
                        extracted = actual_data.get("extracted_data", {})
                        vision_response = actual_data.get("vision_response")
                        if vision_response and (not extracted or not extracted.get("headerSection")):
                            converted_data = convert_vision_response_to_extracted_data(vision_response)
                            actual_data["extracted_data"] = converted_data
                            print(f"\n🔄 [PATCHED] Applied field mapping conversion to {tool_name} (double-nested)")
                            print(f" Invoice #: {converted_data['billingDetails']['invoiceNumber']}")
                            print(f" Amount: ${converted_data['chargesSummary']['document_total']}")
                        # Always print the raw JSON response
                        if vision_response:
                            print("\n📝 [AGENT 3] Vision Model Raw JSON Response:")
                            try:
                                parsed = json.loads(vision_response.replace('```json','').replace('```','').strip())
                                print(json.dumps(parsed, indent=2))
                            except Exception:
                                print(vision_response)
                    else:
                        extracted = inner_data.get("extracted_data", {})
                        vision_response = inner_data.get("vision_response")
                        if vision_response and (not extracted or not extracted.get("headerSection")):
                            converted_data = convert_vision_response_to_extracted_data(vision_response)
                            inner_data["extracted_data"] = converted_data
                            print(f"\n🔄 [PATCHED] Applied field mapping conversion to {tool_name} (single-nested)")
                            print(f" Invoice #: {converted_data['billingDetails']['invoiceNumber']}")
                            print(f" Amount: ${converted_data['chargesSummary']['document_total']}")
                        if vision_response:
                            print("\n📝 [AGENT 3] Vision Model Raw JSON Response:")
                            try:
                                parsed = json.loads(vision_response.replace('```json','').replace('```','').strip())
                                print(json.dumps(parsed, indent=2))
                            except Exception:
                                print(vision_response)
                else:
                    extracted = data.get("extracted_data", {})
                    vision_response = data.get("vision_response")
                    if vision_response and (not extracted or not extracted.get("headerSection")):
                        converted_data = convert_vision_response_to_extracted_data(vision_response)
                        data["extracted_data"] = converted_data
                        print(f"\n🔄 [PATCHED] Applied field mapping conversion to {tool_name} (direct)")
                        print(f" Invoice #: {converted_data['billingDetails']['invoiceNumber']}")
                        print(f" Amount: ${converted_data['chargesSummary']['document_total']}")
                    if vision_response:
                        print("\n📝 [AGENT 3] Vision Model Raw JSON Response:")
                        try:
                            parsed = json.loads(vision_response.replace('```json','').replace('```','').strip())
                            print(json.dumps(parsed, indent=2))
                        except Exception:
                            print(vision_response)
        print_vision_model_data(agent, tool_results)
        return tool_results
    finally:
        restore_api_url(original_url)

def fix_pdfprocessor_response(agent, result_data, **kwargs):
    """Custom processing function that calls remote API for PDF processing and prints JSON"""
    print(f"\n[DEBUG] fix_pdfprocessor_response called for {agent.role}")
    print(f"[DEBUG] Result data type: {type(result_data)}")
    print(f"[DEBUG] Result data: {result_data}")

    # Get the file path from the result_data (the execution engine passes input data here)
    file_path = result_data.get('file', '')
    print(f"[DEBUG] File path from result_data: {file_path}")

    if not file_path:
        print("❌ No file path provided in result_data")
        print(f"[DEBUG] Available keys in result_data: {list(result_data.keys())}")
        return result_data

    try:
        import requests
        import json
        import os
        import base64

        # Use the remote API for PDF processing
        api_url = "https://api.memra.co"
        api_key = os.getenv("MEMRA_API_KEY", "test-secret-for-development")

        # Since the file is already uploaded and we have the remote path, use it directly
        print(f"🔍 Using remote file path: {file_path}")

        # Call the PDFProcessor with the remote path
        print(f"🔍 Calling PDFProcessor with remote path...")

        pdf_data = {
            "file_path": file_path
        }

        response = requests.post(
            f"{api_url}/tools/execute",
            json={
                "tool_name": "PDFProcessor",
                "parameters": pdf_data
            },
            headers={
                "X-API-Key": api_key,
                "Content-Type": "application/json"
            }
        )

        if response.status_code != 200:
            print(f"❌ PDFProcessor call failed: {response.status_code}")
            print(f" Response: {response.text}")
            return result_data

        pdf_result = response.json()
        print(f"\n🎯 AGENT 3 - FULL PDFPROCESSOR RESPONSE:")
        print("=" * 60)
        print(json.dumps(pdf_result, indent=2, default=str))
        print("=" * 60)

        # Extract the vision response from the nested structure
        vision_response = None
        if pdf_result.get("success") and "data" in pdf_result:
            data = pdf_result["data"]

            # Check for nested data structure
            if isinstance(data, dict) and "data" in data:
                actual_data = data["data"]
                if "vision_response" in actual_data:
                    vision_response = actual_data["vision_response"]
            elif "vision_response" in data:
                vision_response = data["vision_response"]

        if vision_response:
            print(f"\n🎯 AGENT 3 - RAW VISION MODEL JSON:")
            print("=" * 60)
            print(vision_response)
            print("=" * 60)

            # Try to parse the JSON response
            try:
                # Clean up the response - remove markdown code blocks if present
                cleaned_response = vision_response
                if cleaned_response.startswith("```json"):
                    cleaned_response = cleaned_response.replace("```json", "").replace("```", "").strip()
                elif cleaned_response.startswith("```"):
                    cleaned_response = cleaned_response.replace("```", "").strip()

                parsed_data = json.loads(cleaned_response)
                print(f"\n✅ [AGENT 3] Successfully parsed JSON:")
                print(json.dumps(parsed_data, indent=2))

                # Convert to the expected format
                extracted_data = convert_vision_response_to_extracted_data(cleaned_response)

                # Debug vendor extraction
                print(f"\n🔍 [AGENT 3] Extracted vendor: '{extracted_data['headerSection']['vendorName']}'")
                print(f" Invoice #: {extracted_data['billingDetails']['invoiceNumber']}")
                print(f" Amount: ${extracted_data['chargesSummary']['document_total']}")

                # Update the result_data
                result_data = {
                    "success": True,
                    "data": {
                        "vision_response": vision_response,
                        "extracted_data": extracted_data
                    },
                    "_memra_metadata": {
                        "agent_role": agent.role,
                        "tools_real_work": ["PDFProcessor"],
                        "tools_mock_work": [],
                        "work_quality": "real"
                    }
                }

                return result_data

            except json.JSONDecodeError as e:
                print(f"❌ JSON parsing error: {e}")
                print(f"Raw response: {vision_response}")
                return result_data
        else:
            print(f"❌ No vision_response found in PDFProcessor result")
            return result_data

    except Exception as e:
        print(f"❌ Error in PDF processing: {e}")
        return result_data

def direct_vision_processing(agent, result_data, **kwargs):
    """Direct vision model processing without using tools with retry logic"""
    print(f"\n[DEBUG] direct_vision_processing called for {agent.role}")
    print(f"[DEBUG] Result data type: {type(result_data)}")
    print(f"[DEBUG] Result data: {result_data}")
    print(f"[DEBUG] Kwargs: {kwargs}")

    # Get the file path from the input data - check kwargs['input'] first
    input_data = kwargs.get('input', {})
    file_path = input_data.get('file', '') or kwargs.get('file', '') or result_data.get('file', '')
    print(f"[DEBUG] File path: {file_path}")

    # Get the invoice schema from previous agent results
    results = kwargs.get('results', {})
    invoice_schema = results.get('invoice_schema', {})
    schema_results = invoice_schema.get('results', [])
    print(f"[DEBUG] Schema fields: {[col['column_name'] for col in schema_results]}")

    if not file_path:
        print("❌ No file path provided")
        return result_data

    # Retry logic for vision processing
    for attempt in range(PROCESSING_CONFIG["max_retries"] + 1):
        try:
            import requests
            import json
            import os
            import base64

            # Use the remote API for PDF processing
            api_url = "https://api.memra.co"
            api_key = os.getenv("MEMRA_API_KEY", "test-secret-for-development")

            # Check if file is already a remote path
            if file_path.startswith('/uploads/'):
                print(f"✅ File already uploaded to remote API: {file_path}")
                remote_path = file_path
            else:
                # Local file - need to upload
                print(f"📤 Uploading file to remote API (attempt {attempt + 1})...")

                # Read the file and encode as base64
                with open(file_path, 'rb') as f:
                    file_content = f.read()

                file_b64 = base64.b64encode(file_content).decode('utf-8')

                # Prepare upload data
                upload_data = {
                    "filename": os.path.basename(file_path),
                    "content": file_b64,
                    "content_type": "application/pdf"
                }

                # Upload to remote API with timeout
                response = requests.post(
                    f"{api_url}/upload",
                    json=upload_data,
                    headers={
                        "X-API-Key": api_key,
                        "Content-Type": "application/json"
                    },
                    timeout=PROCESSING_CONFIG["timeout_seconds"]
                )

                if response.status_code != 200:
                    print(f"❌ Upload failed: {response.status_code}")
                    print(f" Response: {response.text}")

                    # Check for rate limiting
                    if response.status_code == 429:
                        delay = PROCESSING_CONFIG["rate_limit_delay"] * (2 ** attempt)
                        print(f"⏳ Rate limited, waiting {delay}s before retry...")
                        time.sleep(delay)
                        continue
                    else:
                        return result_data

                upload_result = response.json()
                if not upload_result.get("success"):
                    print(f"❌ Upload failed: {upload_result.get('error')}")
                    return result_data

                remote_path = upload_result["data"]["remote_path"]
                print(f"✅ File uploaded successfully")
                print(f" Remote path: {remote_path}")

            # Now call the PDFProcessor with the remote path
            print(f"🔍 Calling PDFProcessor with remote path (attempt {attempt + 1})...")

            # Convert schema to format expected by PDFProcessor
            schema_for_pdf = None
            if schema_results:
                # Send the raw schema array - server now handles both formats
                schema_for_pdf = [
                    col for col in schema_results
                    if col["column_name"] not in ["id", "created_at", "updated_at", "status", "raw_json"]
                ]
                print(f"📋 Passing schema with {len(schema_for_pdf)} fields to PDFProcessor")
                print(f"📋 Schema fields: {[c['column_name'] for c in schema_for_pdf]}")

            response = requests.post(
                f"{api_url}/tools/execute",
                json={
                    "tool_name": "PDFProcessor",
                    "hosted_by": "memra",
                    "input_data": {
                        "file": remote_path,
                        "schema": schema_for_pdf
                    }
                },
                headers={
                    "X-API-Key": api_key,
                    "Content-Type": "application/json"
                },
                timeout=PROCESSING_CONFIG["timeout_seconds"]
            )

            if response.status_code != 200:
                print(f"❌ PDFProcessor call failed: {response.status_code}")
                print(f" Response: {response.text}")

                # Check for rate limiting
                if response.status_code == 429:
                    delay = PROCESSING_CONFIG["rate_limit_delay"] * (2 ** attempt)
                    print(f"⏳ Rate limited, waiting {delay}s before retry...")
                    time.sleep(delay)
                    continue
                else:
                    return result_data

            pdf_result = response.json()
            print(f"\n🎯 AGENT 3 - FULL PDFPROCESSOR RESPONSE:")
            print("=" * 60)
            print(json.dumps(pdf_result, indent=2, default=str))
            print("=" * 60)

            # Extract the vision response from the nested structure
            vision_response = None
            if pdf_result.get("success") and "data" in pdf_result:
                data = pdf_result["data"]

                # Check for nested data structure
                if isinstance(data, dict) and "data" in data:
                    actual_data = data["data"]
                    if "vision_response" in actual_data:
                        vision_response = actual_data["vision_response"]
                elif "vision_response" in data:
                    vision_response = data["vision_response"]

            if vision_response:
                print(f"\n🎯 AGENT 3 - RAW VISION MODEL JSON:")
                print("=" * 60)
                print(vision_response)
                print("=" * 60)

                # Try to parse the JSON response
                try:
                    # Clean up the response - remove markdown code blocks if present
                    cleaned_response = vision_response
                    if cleaned_response.startswith("```json"):
                        cleaned_response = cleaned_response.replace("```json", "").replace("```", "").strip()
                    elif cleaned_response.startswith("```"):
                        cleaned_response = cleaned_response.replace("```", "").strip()

                    parsed_data = json.loads(cleaned_response)
                    print(f"\n✅ [AGENT 3] Successfully parsed JSON:")
                    print(json.dumps(parsed_data, indent=2))

                    # Convert to the expected format
                    extracted_data = convert_vision_response_to_extracted_data(cleaned_response)

                    # Debug vendor extraction
                    print(f"\n🔍 [AGENT 3] Extracted vendor: '{extracted_data['headerSection']['vendorName']}'")
                    print(f" Invoice #: {extracted_data['billingDetails']['invoiceNumber']}")
                    print(f" Amount: ${extracted_data['chargesSummary']['document_total']}")

                    # Update the result_data
                    result_data = {
                        "success": True,
                        "data": {
                            "vision_response": vision_response,
                            "extracted_data": extracted_data
                        },
                        "_memra_metadata": {
                            "agent_role": agent.role,
                            "tools_real_work": ["PDFProcessor"],
                            "tools_mock_work": [],
                            "work_quality": "real"
                        }
                    }

                    return result_data

                except json.JSONDecodeError as e:
                    print(f"❌ JSON parsing error: {e}")
                    print(f"Raw response: {vision_response}")

                    # Don't retry on JSON parsing errors
                    return result_data
            else:
                print(f"❌ No vision_response found in PDFProcessor result")

                # Retry if no vision response (might be temporary API issue)
                if attempt < PROCESSING_CONFIG["max_retries"]:
                    delay = PROCESSING_CONFIG["retry_delay_base"] * (2 ** attempt)
                    print(f"⏳ No vision response, waiting {delay}s before retry...")
                    time.sleep(delay)
                    continue
                else:
                    return result_data

        except requests.exceptions.Timeout:
            print(f"⏰ Vision processing timeout (attempt {attempt + 1})")
            if attempt < PROCESSING_CONFIG["max_retries"]:
                delay = PROCESSING_CONFIG["retry_delay_base"] * (2 ** attempt)
                print(f"⏳ Waiting {delay}s before retry...")
                time.sleep(delay)
                continue
        except Exception as e:
            print(f"❌ Error in PDF processing (attempt {attempt + 1}): {e}")
            if attempt < PROCESSING_CONFIG["max_retries"]:
                delay = PROCESSING_CONFIG["retry_delay_base"] * (2 ** attempt)
                print(f"⏳ Waiting {delay}s before retry...")
                time.sleep(delay)
                continue

    print(f"❌ Failed to process vision after {PROCESSING_CONFIG['max_retries'] + 1} attempts")
    return result_data

# Create a new Agent 3 that bypasses the tool system
direct_vision_agent = Agent(
    role="Invoice Parser",
    job="Extract structured data from invoice PDF using vision model",
    llm=parsing_llm,
    sops=[
        "Load invoice PDF file",
        "Send to vision model for field extraction",
        "Print out the raw JSON data returned by vision model tools",
        "Validate extracted data against schema types",
        "Return structured invoice data"
    ],
    systems=["InvoiceStore"],
    tools=[],  # No tools - we'll do direct API calls in custom processing
    input_keys=["file", "invoice_schema"],
    output_key="invoice_data",
    custom_processing=direct_vision_processing
)

parser_agent = Agent(
    role="Invoice Parser",
    job="Extract structured data from invoice PDF using vision model",
    llm=parsing_llm,
    sops=[
        "Load invoice PDF file",
        "Send to vision model for field extraction",
        "Print out the raw JSON data returned by vision model tools",
        "Validate extracted data against schema types",
        "Return structured invoice data"
    ],
    systems=["InvoiceStore"],
    tools=[
        {"name": "PDFProcessor", "hosted_by": "memra", "input_keys": ["file_path"]}
    ],
    input_keys=["file", "invoice_schema"],
    output_key="invoice_data",
    custom_processing=pdf_processing_with_remote_api
)

def process_database_insertion(agent, tool_results, **kwargs):
    """Custom processing for Agent 4 to properly map invoice data to database format"""
    print(f"\n[DEBUG] process_database_insertion called for {agent.role}")

    # Get the invoice data from kwargs
    input_data = kwargs.get('input', {})
    results = kwargs.get('results', {})

    # Try to get invoice_data from various sources
    invoice_data = (
        results.get('invoice_data') or
        input_data.get('invoice_data') or
        kwargs.get('invoice_data', {})
    )

    print(f"[DEBUG] Invoice data type: {type(invoice_data)}")
    print(f"[DEBUG] Invoice data keys: {list(invoice_data.keys()) if isinstance(invoice_data, dict) else 'Not a dict'}")

    # Transform the data for database insertion
    if isinstance(invoice_data, dict):
        # Create the properly formatted data for database
        db_data = {}

        # Check if data is in the new format (headerSection, billingDetails, etc.)
        if 'headerSection' in invoice_data and 'billingDetails' in invoice_data:
            header = invoice_data.get('headerSection', {})
            billing = invoice_data.get('billingDetails', {})
            charges = invoice_data.get('chargesSummary', {})

            db_data = {
                'vendor_name': header.get('vendorName', ''),
                'invoice_number': billing.get('invoiceNumber', ''),
                'invoice_date': billing.get('invoiceDate', ''),
                'due_date': billing.get('dueDate', ''),
                'total_amount': charges.get('document_total', 0),
                'tax_amount': charges.get('secondary_tax', 0),
                'line_items': json.dumps(charges.get('lineItemsBreakdown', []))
            }

            print(f"\n💾 [AGENT 4] Prepared database record:")
            print(f" vendor_name: '{db_data['vendor_name']}'")
            print(f" invoice_number: '{db_data['invoice_number']}'")
            print(f" invoice_date: '{db_data['invoice_date']}'")
            print(f" total_amount: {db_data['total_amount']}")

        # Check if data is in the old format
        elif 'extracted_data' in invoice_data:
            extracted = invoice_data['extracted_data']
            if isinstance(extracted, dict):
                if 'headerSection' in extracted:
                    # Nested new format
                    header = extracted.get('headerSection', {})
                    billing = extracted.get('billingDetails', {})
                    charges = extracted.get('chargesSummary', {})

                    db_data = {
                        'vendor_name': header.get('vendorName', ''),
                        'invoice_number': billing.get('invoiceNumber', ''),
                        'invoice_date': billing.get('invoiceDate', ''),
                        'due_date': billing.get('dueDate', ''),
                        'total_amount': charges.get('document_total', 0),
                        'tax_amount': charges.get('secondary_tax', 0),
                        'line_items': json.dumps(charges.get('lineItemsBreakdown', []))
                    }
                else:
                    # Old flat format
                    db_data = {
                        'vendor_name': extracted.get('vendor_name', ''),
                        'invoice_number': extracted.get('invoice_number', ''),
                        'invoice_date': extracted.get('invoice_date', ''),
                        'due_date': extracted.get('due_date', ''),
                        'total_amount': extracted.get('amount', extracted.get('total_amount', 0)),
                        'tax_amount': extracted.get('tax_amount', 0),
                        'line_items': json.dumps(extracted.get('line_items', []))
                    }

        # Update tool parameters with the transformed data
        for tool_name, result in tool_results.items():
            if tool_name == "PostgresInsert" and db_data:
                # Inject the properly formatted data into the tool parameters
                if 'parameters' not in result:
                    result['parameters'] = {}
                result['parameters']['data'] = db_data
                print(f"\n✅ [AGENT 4] Injected transformed data into PostgresInsert parameters")

    # Call the original print function for debugging
    print_database_data(agent, tool_results, invoice_data)

    return tool_results

writer_agent = Agent(
    role="Data Entry Specialist",
    job="Write validated invoice data to database",
    llm=default_llm,
    sops=[
        "Validate invoice data completeness",
        "Map fields to database columns using schema",
        "Print out the data being inserted into database",
        "Connect to database",
        "Insert record into invoices table",
        "Return confirmation with record ID"
    ],
    systems=["Database"],
    tools=[
        {"name": "DataValidator", "hosted_by": "mcp"},
        {"name": "PostgresInsert", "hosted_by": "mcp"}
    ],
    input_keys=["invoice_data", "invoice_schema"],
    output_key="write_confirmation",
    custom_processing=process_database_insertion
)

post_monitor_agent = create_simple_monitor_agent()
post_monitor_agent.role = "Post-ETL Database Monitor"

manager_agent = Agent(
    role="ETL Process Manager",
    job="Coordinate ETL pipeline and validate data integrity",
    llm=manager_llm,
    sops=[
        "Review pre-ETL database state",
        "Validate ETL process completion",
        "Compare pre and post database states",
        "Generate ETL summary report",
        "Flag any data quality issues"
    ],
    allow_delegation=True,
    output_key="etl_summary"
)

# Create ETL department
etl_department = Department(
    name="ETL Invoice Processing",
    mission="Complete end-to-end ETL process with comprehensive monitoring",
    agents=[pre_monitor_agent, etl_agent, direct_vision_agent, writer_agent, post_monitor_agent],
    manager_agent=manager_agent,
    workflow_order=[
        "Pre-ETL Database Monitor",
        "Data Engineer",
        "Invoice Parser",
        "Data Entry Specialist",
        "Post-ETL Database Monitor"
    ],
    dependencies=["Database", "InvoiceStore"],
    execution_policy={
        "retry_on_fail": True,
        "max_retries": 2,
        "halt_on_validation_error": True,
        "timeout_seconds": 300
    },
    context={
        "company_id": "acme_corp",
        "fiscal_year": "2024",
        "mcp_bridge_url": "http://localhost:8081",
        "mcp_bridge_secret": "test-secret-for-development"
    }
)

def upload_file_to_api(file_path: str, api_url: str = "https://api.memra.co", max_retries: int = 3) -> str:
    """Upload a file to the remote API for vision-based PDF processing with retry logic"""

    for attempt in range(max_retries + 1):
        try:
            print(f"📤 Uploading {os.path.basename(file_path)} to remote API (attempt {attempt + 1}/{max_retries + 1})")
            print(f" File path: {file_path}")

            # Read the file and encode as base64
            with open(file_path, 'rb') as f:
                file_content = f.read()

            file_b64 = base64.b64encode(file_content).decode('utf-8')

            # Prepare upload data
            upload_data = {
                "filename": os.path.basename(file_path),
                "content": file_b64,
                "content_type": "application/pdf"
            }

            # Upload to remote API
            api_key = os.getenv("MEMRA_API_KEY")
            response = requests.post(
                f"{api_url}/upload",
                json=upload_data,
                headers={
                    "X-API-Key": api_key,
                    "Content-Type": "application/json"
                },
                timeout=PROCESSING_CONFIG["timeout_seconds"]
            )

            if response.status_code == 200:
                result = response.json()
                if result.get("success"):
                    remote_path = result["data"]["remote_path"]
                    print(f"✅ File uploaded successfully")
                    print(f" Remote path: {remote_path}")
                    return remote_path
                else:
                    error_msg = result.get('error', 'Unknown error')
                    print(f"❌ Upload failed: {error_msg}")

                    # Check if it's a rate limit error
                    if "rate limit" in error_msg.lower() or "too many requests" in error_msg.lower():
                        delay = PROCESSING_CONFIG["rate_limit_delay"] * (2 ** attempt)
                        print(f"⏳ Rate limited, waiting {delay}s before retry...")
                        time.sleep(delay)
                        continue
            elif response.status_code == 429:  # Rate limited
                delay = PROCESSING_CONFIG["rate_limit_delay"] * (2 ** attempt)
                print(f"⏳ Rate limited (HTTP 429), waiting {delay}s before retry...")
                time.sleep(delay)
                continue
            else:
                print(f"❌ Upload request failed: {response.status_code}")
                print(f" Response: {response.text}")

                # Don't retry on client errors (4xx) except 429
                if 400 <= response.status_code < 500 and response.status_code != 429:
                    break

        except requests.exceptions.Timeout:
            print(f"⏰ Upload timeout (attempt {attempt + 1})")
            if attempt < max_retries:
                delay = PROCESSING_CONFIG["retry_delay_base"] * (2 ** attempt)
                print(f"⏳ Waiting {delay}s before retry...")
                time.sleep(delay)
                continue
        except Exception as e:
            print(f"⚠️ Upload error (attempt {attempt + 1}): {e}")
            if attempt < max_retries:
                delay = PROCESSING_CONFIG["retry_delay_base"] * (2 ** attempt)
                print(f"⏳ Waiting {delay}s before retry...")
                time.sleep(delay)
                continue

    print(f"❌ Failed to upload {os.path.basename(file_path)} after {max_retries + 1} attempts")
    return file_path

def print_vision_model_data(agent, tool_results):
    """Print out the JSON data returned by vision model tools"""
    print(f"\n🔍 {agent.role}: VISION MODEL DATA ANALYSIS")
    print("=" * 60)

    for tool_name, result in tool_results.items():
        print(f"\n📊 Tool: {tool_name}")
        print(f"✅ Success: {result.get('success', 'Unknown')}")

        if 'data' in result:
            data = result['data']
            print(f"📄 Data Structure:")
            print(f" - Keys: {list(data.keys())}")

            # Print extracted text if available
            if 'extracted_text' in data:
                text = data['extracted_text']
                print(f"📝 Extracted Text ({len(text)} chars):")
                print(f" {text[:200]}{'...' if len(text) > 200 else ''}")

            # Print extracted data if available
            if 'extracted_data' in data:
                extracted = data['extracted_data']
                print(f"🎯 Extracted Data:")

                # Handle both old and new formats
                if 'headerSection' in extracted:
                    # New format (converted)
                    header = extracted.get('headerSection', {})
                    billing = extracted.get('billingDetails', {})
                    charges = extracted.get('chargesSummary', {})
                    print(f" Vendor: {header.get('vendorName', 'N/A')}")
                    print(f" Invoice #: {billing.get('invoiceNumber', 'N/A')}")
                    print(f" Date: {billing.get('invoiceDate', 'N/A')}")
                    print(f" Amount: ${charges.get('document_total', 'N/A')}")
                    print(f" Tax: ${charges.get('secondary_tax', 'N/A')}")
                    print(f" Line Items: {len(charges.get('lineItemsBreakdown', []))} items")
                else:
                    # Old format (direct)
                    print(f" Vendor: {extracted.get('vendor_name', 'N/A')}")
                    print(f" Invoice #: {extracted.get('invoice_number', 'N/A')}")
                    print(f" Date: {extracted.get('invoice_date', 'N/A')}")
                    print(f" Amount: ${extracted.get('amount', 'N/A')}")
                    print(f" Tax: ${extracted.get('tax_amount', 'N/A')}")
                    print(f" Line Items: {extracted.get('line_items', 'N/A')}")

            # Print screenshot info if available
            if 'screenshots_dir' in data:
                print(f"📸 Screenshots:")
                print(f" Directory: {data.get('screenshots_dir', 'N/A')}")
                print(f" Count: {data.get('screenshot_count', 'N/A')}")
                print(f" Invoice ID: {data.get('invoice_id', 'N/A')}")

        if 'error' in result:
            print(f"❌ Error: {result['error']}")

    print("=" * 60)

def print_database_data(agent, tool_results, invoice_data):
    """Print out the data being inserted into database"""
    print(f"\n💾 {agent.role}: DATABASE INSERTION DATA")
    print("=" * 60)

    if invoice_data:
        print(f"📊 Invoice Data to Insert:")
        if isinstance(invoice_data, dict) and 'extracted_data' in invoice_data:
            data = invoice_data['extracted_data']
            print(f" Vendor: '{data.get('vendor_name', '')}'")
            print(f" Invoice #: '{data.get('invoice_number', '')}'")
            print(f" Date: '{data.get('invoice_date', '')}'")
            print(f" Amount: {data.get('amount', 0)}")
            print(f" Tax: {data.get('tax_amount', 0)}")
            print(f" Line Items: '{data.get('line_items', '')}'")
        else:
            print(f" Raw data: {invoice_data}")

    for tool_name, result in tool_results.items():
        print(f"\n🔧 Tool: {tool_name}")
        print(f"✅ Success: {result.get('success', 'Unknown')}")

        if 'data' in result:
            data = result['data']
            print(f"📄 Result Data:")
            for key, value in data.items():
                print(f" {key}: {value}")

        if 'error' in result:
            print(f"❌ Error: {result['error']}")

    print("=" * 60)

def switch_to_remote_api_for_pdf():
    """Temporarily switch to remote API for PDF processing"""
    original_url = os.environ.get("MEMRA_API_URL")
    os.environ["MEMRA_API_URL"] = REMOTE_API_URL
    return original_url

def restore_api_url(original_url):
    """Restore the original API URL"""
    if original_url:
        os.environ["MEMRA_API_URL"] = original_url

def validate_agent_configuration(department):
    """Validate that critical agents have required tools configured"""
    critical_agents = {
        "Invoice Parser": ["PDFProcessor"],
        "Data Entry Specialist": ["DataValidator", "PostgresInsert"],
        "Data Engineer": ["SQLExecutor"]
    }

    for agent in department.agents:
        if agent.role in critical_agents:
            # Skip validation if agent has custom processing function
            if hasattr(agent, 'custom_processing') and agent.custom_processing is not None:
                print(f"ℹ️ {agent.role} uses custom processing (tools validation skipped)")
                continue

            required_tools = critical_agents[agent.role]
            # Handle both Tool objects and dictionaries
            configured_tools = []
            for tool in agent.tools:
                if isinstance(tool, dict):
                    configured_tools.append(tool["name"])
                else:
                    configured_tools.append(tool.name)

            missing_tools = [tool for tool in required_tools if tool not in configured_tools]
            if missing_tools:
                print(f"⚠️ WARNING: {agent.role} is missing critical tools: {missing_tools}")
                print(f" Configured tools: {configured_tools}")
                return False

    return True

def main():
    """Run the ETL demo workflow with robust processing"""
    print("\n🚀 Starting ETL Invoice Processing Demo...")
    print("📊 This demo includes comprehensive database monitoring")
    print("📡 Tools will execute on Memra API server")
    print("📝 Processing 15 specific invoice files with robust error handling")
    print("⏱️ Includes delays between files and retry logic for API resilience")
    print("🎯 Target files:", ", ".join(TARGET_FILES))

    # Configuration
    config = {
        "table_name": os.getenv("MEMRA_TABLE_NAME", "invoices"),
        "data_directory": os.getenv("MEMRA_DATA_DIR", "data/invoices"),
        "company_id": os.getenv("MEMRA_COMPANY_ID", "acme_corp"),
        "fiscal_year": os.getenv("MEMRA_FISCAL_YEAR", "2024"),
        "database_url": os.getenv("MEMRA_DATABASE_URL", "postgresql://memra:memra123@localhost:5432/memra_invoice_db")
    }

    # Generate schema query dynamically
    schema_query = f"SELECT column_name, data_type, is_nullable, column_default FROM information_schema.columns WHERE table_name = '{config['table_name']}' ORDER BY ordinal_position"

    # Validate agent configuration before proceeding
    if not validate_agent_configuration(etl_department):
        print("❌ Critical agents are missing required tools!")
        print("⚠️ Please fix agent configuration before running ETL process")
        sys.exit(1)

    engine = ExecutionEngine()

    # Use configurable data directory
    current_dir = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(current_dir, config["data_directory"])

    # Find only the target files
    invoice_files = []
    missing_files = []

    for target_file in TARGET_FILES:
        file_path = os.path.join(data_dir, target_file)
        if os.path.exists(file_path):
            invoice_files.append(file_path)
        else:
            missing_files.append(target_file)

    if missing_files:
        print(f"⚠️ Missing files: {', '.join(missing_files)}")

    if not invoice_files:
        print(f"❌ No target files found in {config['data_directory']}/ directory")
        print("📝 Available files:")
        available_files = glob.glob(os.path.join(data_dir, "*.PDF"))
        for file in available_files[:10]:  # Show first 10
            print(f" - {os.path.basename(file)}")
        if len(available_files) > 10:
            print(f" ... and {len(available_files) - 10} more")
        sys.exit(1)

    print(f"\n📁 Found {len(invoice_files)} target files to process")
    print(f"⏱️ Estimated processing time: {len(invoice_files) * PROCESSING_CONFIG['delay_between_files']:.1f} seconds (plus processing time)")

    # Process files with robust error handling
    successful_processing = 0
    failed_processing = 0
    skipped_processing = 0

    for idx, invoice_file in enumerate(invoice_files):
        filename = os.path.basename(invoice_file)
        print(f"\n{'='*60}")
        print(f"📄 Processing file {idx + 1}/{len(invoice_files)}: {filename}")
        print(f"{'='*60}")

        # Add delay between files (except for the first one)
        if idx > 0:
            delay = PROCESSING_CONFIG["delay_between_files"] + random.uniform(0, 1)  # Add some randomness
            print(f"⏳ Waiting {delay:.1f}s between files...")
            time.sleep(delay)

        try:
            # Upload file with retry logic
            remote_file_path = upload_file_to_api(invoice_file, max_retries=PROCESSING_CONFIG["max_retries"])

            if remote_file_path == invoice_file:
                print(f"❌ Failed to upload {filename}, skipping...")
                failed_processing += 1
                continue

            # Run the full ETL workflow with configurable parameters
            input_data = {
                "file": remote_file_path,
                "connection": config["database_url"],
                "table_name": config["table_name"],
                "sql_query": schema_query
            }

            result = engine.execute_department(etl_department, input_data)

            if result.success:
                successful_processing += 1
                print(f"\n✅ Successfully processed: {filename}")

                # Show summary if available
                if 'etl_summary' in result.data:
                    summary = result.data['etl_summary']
                    print(f"📋 Status: {summary.get('status', 'success')}")
                if 'write_confirmation' in result.data:
                    write_conf = result.data['write_confirmation']
                    if isinstance(write_conf, dict) and 'record_id' in write_conf:
                        print(f"💾 Database Record ID: {write_conf['record_id']}")
            else:
                failed_processing += 1
                print(f"\n❌ Failed to process: {filename}")
                print(f" Error: {result.error}")
                if result.trace and result.trace.errors:
                    print(" Details:")
                    for error in result.trace.errors:
                        print(f" - {error}")

        except Exception as e:
            failed_processing += 1
            print(f"\n💥 Unexpected error processing {filename}: {e}")
            print(" Continuing with next file...")
            continue

    # Final summary
    print(f"\n{'='*60}")
    print(f"🎯 ETL DEMO COMPLETED")
    print(f"{'='*60}")
    print(f"📊 Processing Summary:")
    print(f" ✅ Successful: {successful_processing}")
    print(f" ❌ Failed: {failed_processing}")
    print(f" ⏭️ Skipped: {skipped_processing}")
    print(f" 📄 Total: {len(invoice_files)}")

    if successful_processing > 0:
        print(f"\n🎉 Demo completed successfully!")
        print(f" Processed {successful_processing} invoices with robust error handling")
        print(f" This demonstrates real-world API resilience and rate limiting")
    else:
        print(f"\n⚠️ No files were processed successfully")
        print(f" Check API connectivity and file availability")

    print(f"\n💡 This demo shows realistic production scenarios:")
    print(f" - API rate limiting and retry logic")
    print(f" - Graceful error handling and file skipping")
    print(f" - Delays between requests to avoid overwhelming APIs")
    print(f" - Exponential backoff for failed requests")

if __name__ == "__main__":
    main()