memra 0.2.13__py3-none-any.whl → 0.2.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memra/cli.py +322 -51
- {memra-0.2.13.dist-info → memra-0.2.15.dist-info}/METADATA +1 -1
- {memra-0.2.13.dist-info → memra-0.2.15.dist-info}/RECORD +7 -61
- memra-0.2.15.dist-info/top_level.txt +1 -0
- memra-0.2.13.dist-info/top_level.txt +0 -4
- memra-ops/app.py +0 -808
- memra-ops/config/config.py +0 -25
- memra-ops/config.py +0 -34
- memra-ops/logic/__init__.py +0 -1
- memra-ops/logic/file_tools.py +0 -43
- memra-ops/logic/invoice_tools.py +0 -668
- memra-ops/logic/invoice_tools_fix.py +0 -66
- memra-ops/mcp_bridge_server.py +0 -1178
- memra-ops/scripts/check_database.py +0 -37
- memra-ops/scripts/clear_database.py +0 -48
- memra-ops/scripts/monitor_database.py +0 -67
- memra-ops/scripts/release.py +0 -133
- memra-ops/scripts/reset_database.py +0 -65
- memra-ops/scripts/start_memra.py +0 -334
- memra-ops/scripts/stop_memra.py +0 -132
- memra-ops/server_tool_registry.py +0 -190
- memra-ops/tests/test_llm_text_to_sql.py +0 -115
- memra-ops/tests/test_llm_vs_pattern.py +0 -130
- memra-ops/tests/test_mcp_schema_aware.py +0 -124
- memra-ops/tests/test_schema_aware_sql.py +0 -139
- memra-ops/tests/test_schema_aware_sql_simple.py +0 -66
- memra-ops/tests/test_text_to_sql_demo.py +0 -140
- memra-ops/tools/mcp_bridge_server.py +0 -851
- memra-sdk/examples/accounts_payable.py +0 -215
- memra-sdk/examples/accounts_payable_client.py +0 -217
- memra-sdk/examples/accounts_payable_mcp.py +0 -200
- memra-sdk/examples/ask_questions.py +0 -123
- memra-sdk/examples/invoice_processing.py +0 -116
- memra-sdk/examples/propane_delivery.py +0 -87
- memra-sdk/examples/simple_text_to_sql.py +0 -158
- memra-sdk/memra/__init__.py +0 -31
- memra-sdk/memra/discovery.py +0 -15
- memra-sdk/memra/discovery_client.py +0 -49
- memra-sdk/memra/execution.py +0 -481
- memra-sdk/memra/models.py +0 -99
- memra-sdk/memra/tool_registry.py +0 -343
- memra-sdk/memra/tool_registry_client.py +0 -106
- memra-sdk/scripts/release.py +0 -133
- memra-sdk/setup.py +0 -52
- memra-workflows/accounts_payable/accounts_payable.py +0 -215
- memra-workflows/accounts_payable/accounts_payable_client.py +0 -216
- memra-workflows/accounts_payable/accounts_payable_mcp.py +0 -200
- memra-workflows/accounts_payable/accounts_payable_smart.py +0 -221
- memra-workflows/invoice_processing/invoice_processing.py +0 -116
- memra-workflows/invoice_processing/smart_invoice_processor.py +0 -220
- memra-workflows/logic/__init__.py +0 -1
- memra-workflows/logic/file_tools.py +0 -50
- memra-workflows/logic/invoice_tools.py +0 -501
- memra-workflows/logic/propane_agents.py +0 -52
- memra-workflows/mcp_bridge_server.py +0 -230
- memra-workflows/propane_delivery/propane_delivery.py +0 -87
- memra-workflows/text_to_sql/complete_invoice_workflow_with_queries.py +0 -208
- memra-workflows/text_to_sql/complete_text_to_sql_system.py +0 -266
- memra-workflows/text_to_sql/file_discovery_demo.py +0 -156
- {memra-0.2.13.dist-info → memra-0.2.15.dist-info}/LICENSE +0 -0
- {memra-0.2.13.dist-info → memra-0.2.15.dist-info}/WHEEL +0 -0
- {memra-0.2.13.dist-info → memra-0.2.15.dist-info}/entry_points.txt +0 -0
memra-ops/logic/invoice_tools.py
DELETED
@@ -1,668 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Invoice processing tools for the Memra API server
|
3
|
-
"""
|
4
|
-
|
5
|
-
import os
|
6
|
-
import logging
|
7
|
-
import json
|
8
|
-
import tempfile
|
9
|
-
from typing import Dict, Any, Optional, List
|
10
|
-
import subprocess
|
11
|
-
from PIL import Image
|
12
|
-
import base64
|
13
|
-
import io
|
14
|
-
import uuid
|
15
|
-
from pathlib import Path
|
16
|
-
import requests
|
17
|
-
|
18
|
-
logger = logging.getLogger(__name__)
|
19
|
-
|
20
|
-
class PDFProcessor:
    """Extract invoice data from a PDF by rendering it and querying a vision model.

    Pipeline implemented by :meth:`process_pdf`:

    1. render the first page of the PDF to a PNG with ``pdftoppm``;
    2. build a prompt (optionally derived from a table schema);
    3. send prompt + image to a Hugging Face hosted chat model;
    4. parse the JSON reply and reshape it into the nested
       ``headerSection`` / ``billingDetails`` / ``chargesSummary`` structure
       that the code calls the "MCP bridge" format.
    """

    # Model id passed to the Hugging Face chat-completions call.
    _VISION_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
    # Upper bound on reply size; NOTE(review): invoices with many line items
    # may be truncated at this limit and then fail JSON parsing.
    _MAX_RESPONSE_TOKENS = 500
    # Seconds pdftoppm may run before the render is abandoned.
    _SCREENSHOT_TIMEOUT_SECONDS = 15

    # Columns that are never requested from the model.
    _SYSTEM_COLUMNS = ("id", "created_at", "updated_at", "status", "raw_json")
    # Fields requested when no usable schema is supplied.
    _DEFAULT_FIELDS = (
        "vendor_name",
        "invoice_number",
        "invoice_date",
        "total_amount",
        "line_items",
    )
    # Human-readable hints for well-known columns; other columns fall back
    # to their declared type.
    _FIELD_HINTS = {
        "vendor_name": "The company name at the top of the invoice",
        "invoice_number": "The invoice number or ID",
        "invoice_date": "The date the invoice was issued (YYYY-MM-DD format)",
        "total_amount": "The total invoice amount",
        "due_date": "The invoice due date (YYYY-MM-DD format)",
        "tax_amount": "The tax amount on the invoice",
        "line_items": "Array of items with descriptions and amounts",
    }
    _PROMPT_HEADER = (
        "Extract invoice data from this image and return ONLY a JSON object "
        "with these specific fields:"
    )
    _PROMPT_FOOTER = (
        "Look specifically for the company/vendor name prominently displayed "
        "on the invoice.\n"
        "\n"
        "Return ONLY valid JSON with no additional text or explanation."
    )

    def __init__(self):
        self.upload_dir = "/tmp/uploads"
        self.screenshots_dir = "/tmp/screenshots"
        # Only the screenshots directory is created here; upload_dir is
        # presumably created by whatever stores the uploads (verify).
        os.makedirs(self.screenshots_dir, exist_ok=True)

    def process_pdf(self, file_path: str, schema: Dict[str, Any] = None) -> Dict[str, Any]:
        """Run the full extraction pipeline for one PDF.

        Args:
            file_path: Path to the PDF. A path starting with ``/uploads/`` is
                resolved to its basename inside ``self.upload_dir``; any other
                path is used as given.
            schema: Optional schema used to build the prompt; either a list
                of column dicts or a dict with a ``"columns"`` list.

        Returns:
            ``{"success": False, "error": ...}`` when the file is missing,
            rendering fails, or an unexpected exception occurs. Otherwise
            ``{"success": True, "data": {...}}`` where ``data`` carries the
            prompt, the raw model reply and ``extracted_data``. Model or
            parse failures are reported through ``extracted_data["status"]``
            (``vision_model_error`` / ``json_parse_error`` /
            ``conversion_error``), not through ``success``.
        """
        try:
            if not file_path:
                return {"success": False, "error": "No file path provided"}

            # Uploaded files are looked up by basename only, which keeps the
            # lookup inside upload_dir.
            if file_path.startswith('/uploads/'):
                full_path = os.path.join(self.upload_dir, os.path.basename(file_path))
            else:
                full_path = file_path

            if not os.path.exists(full_path):
                return {"success": False, "error": f"PDF file not found: {file_path}"}

            logger.info("Processing PDF: %s", file_path)

            # Step 1: every run gets its own screenshot directory.
            invoice_id = str(uuid.uuid4())
            invoice_dir = os.path.join(self.screenshots_dir, invoice_id)
            os.makedirs(invoice_dir, exist_ok=True)

            # Step 2: render the PDF.
            logger.info("Creating screenshots...")
            screenshot_paths = self._create_screenshots(full_path, invoice_dir)
            if not screenshot_paths:
                return {
                    "success": False,
                    "error": "Failed to create screenshots from PDF (timeout or error)",
                }

            # Step 3: query the vision model with the first screenshot.
            logger.info(
                "Sending %s screenshots to vision model with schema...",
                len(screenshot_paths),
            )
            vision_prompt = self._build_schema_prompt(schema)

            logger.info("Vision Model Prompt: %s", vision_prompt)
            print("\n🔎 VISION MODEL PROMPT:")
            print("=" * 60)
            print(vision_prompt)
            print("=" * 60)

            vision_response = self._call_vision_model_with_schema(
                screenshot_paths[0], vision_prompt
            )

            logger.info("Vision Model JSON Response: %s", vision_response)
            print("\n📝 VISION MODEL JSON RESPONSE:")
            print("=" * 60)
            print(vision_response)
            print("=" * 60)

            # Step 4: parse and reshape the reply.
            mcp_format_data = self._parse_vision_response(vision_response)

            return {
                "success": True,
                "data": {
                    "file_path": file_path,
                    "invoice_id": invoice_id,
                    "screenshots_dir": invoice_dir,
                    "screenshot_count": len(screenshot_paths),
                    "vision_prompt": vision_prompt,
                    "vision_response": vision_response,
                    "extracted_data": mcp_format_data,
                },
            }

        except Exception as e:
            logger.error("PDF processing failed: %s", str(e))
            return {"success": False, "error": str(e)}

    def _parse_vision_response(self, vision_response: str) -> Dict[str, Any]:
        """Turn the raw model reply into an MCP-format record."""
        try:
            extracted_data = json.loads(self._strip_code_fence(vision_response))
        except json.JSONDecodeError as e:
            logger.error("Failed to parse JSON response: %s", e)
            if "error" in vision_response.lower():
                return self._empty_mcp_payload(
                    "vision_model_error", error_message=vision_response
                )
            return self._empty_mcp_payload(
                "json_parse_error", raw_response=vision_response
            )

        # _call_vision_model_with_schema reports its own failures as a JSON
        # object with an "error" key. That is valid JSON, so without this
        # check it would be converted into an empty record marked "processed".
        if isinstance(extracted_data, dict) and "error" in extracted_data:
            return self._empty_mcp_payload(
                "vision_model_error", error_message=vision_response
            )

        logger.info("Successfully parsed JSON response: %s", extracted_data)
        return self._convert_to_mcp_format(extracted_data)

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Remove a surrounding Markdown code fence (with or without ``json``)."""
        cleaned = text.strip()
        if cleaned.startswith('```json'):
            cleaned = cleaned[7:]
        elif cleaned.startswith('```'):
            cleaned = cleaned[3:]
        if cleaned.endswith('```'):
            cleaned = cleaned[:-3]
        return cleaned.strip()

    @staticmethod
    def _empty_mcp_payload(status: str, **extra: Any) -> Dict[str, Any]:
        """Return a blank MCP-format record carrying ``status`` and any extras."""
        payload = {
            "headerSection": {"vendorName": "", "subtotal": 0},
            "billingDetails": {"invoiceNumber": "", "invoiceDate": "", "dueDate": ""},
            "chargesSummary": {
                "document_total": 0,
                "secondary_tax": 0,
                "lineItemsBreakdown": [],
            },
            "status": status,
        }
        payload.update(extra)
        return payload

    def _create_screenshots(self, pdf_path: str, output_dir: str) -> List[str]:
        """Render the first page of ``pdf_path`` to a 100 DPI PNG in ``output_dir``.

        Returns:
            Sorted paths of the ``.png`` files found in ``output_dir`` after
            ``pdftoppm`` ran, or ``[]`` when the command fails, times out or
            raises (for example when ``pdftoppm`` is not installed).
        """
        try:
            cmd = [
                'pdftoppm',
                '-png',        # output format
                '-r', '100',   # low resolution, chosen for speed
                '-cropbox',    # use the crop box for consistent sizing
                '-f', '1',     # first page to render
                '-l', '1',     # last page to render (first page only)
                pdf_path,
                os.path.join(output_dir, 'page'),  # output file prefix
            ]

            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                timeout=self._SCREENSHOT_TIMEOUT_SECONDS,
            )
            if result.returncode != 0:
                logger.error("pdftoppm failed: %s", result.stderr)
                return []

            screenshot_paths = [
                os.path.join(output_dir, entry)
                for entry in sorted(os.listdir(output_dir))
                if entry.endswith('.png')
            ]

            logger.info(
                "Created %s screenshots in %s", len(screenshot_paths), output_dir
            )
            return screenshot_paths

        except subprocess.TimeoutExpired:
            logger.error(
                "Screenshot creation timed out after %s seconds",
                self._SCREENSHOT_TIMEOUT_SECONDS,
            )
            return []
        except Exception as e:
            logger.error("Screenshot creation failed: %s", str(e))
            return []

    def _format_prompt(self, field_lines: List[str]) -> str:
        """Assemble the prompt text around the given ``- name: hint`` lines."""
        return (
            self._PROMPT_HEADER
            + "\n"
            + "\n".join(field_lines)
            + "\n\n"
            + self._PROMPT_FOOTER
        )

    def _build_schema_prompt(self, schema: Dict[str, Any]) -> str:
        """Build the extraction prompt, listing the fields found in ``schema``.

        ``schema`` may be a list of column dicts or a dict with a
        ``"columns"`` list; each column is read through ``column_name`` /
        ``name`` and ``data_type`` / ``type``. Columns named in
        ``_SYSTEM_COLUMNS`` are skipped. When the schema is missing, has an
        unknown shape, or yields no fields, the default five-field prompt is
        returned.
        """
        logger.info("_build_schema_prompt called with schema type: %s", type(schema))
        logger.info("Schema content: %s", schema)

        base_prompt = self._format_prompt(
            [f"- {name}: {self._FIELD_HINTS[name]}" for name in self._DEFAULT_FIELDS]
        )

        if not schema:
            logger.info("No schema provided, returning base prompt")
            return base_prompt

        if isinstance(schema, list):
            # Client sent the array of column objects directly.
            columns = schema
        elif isinstance(schema, dict) and "columns" in schema:
            columns = schema["columns"]
        else:
            return base_prompt

        # A "columns" value that is not a sequence cannot be iterated safely.
        if not isinstance(columns, (list, tuple)):
            return base_prompt

        field_descriptions = []
        logger.info("Building prompt from %s columns", len(columns))
        for col in columns:
            if not isinstance(col, dict):
                # Malformed entry: skip it instead of failing the whole run.
                continue
            name = col.get("column_name") or col.get("name", "")
            col_type = col.get("data_type") or col.get("type", "")
            logger.info("Processing column: %s (%s)", name, col_type)

            if name and name not in self._SYSTEM_COLUMNS:
                hint = self._FIELD_HINTS.get(name, col_type)
                field_descriptions.append(f"- {name}: {hint}")

        logger.info("Built %s field descriptions", len(field_descriptions))
        if not field_descriptions:
            logger.info("No field descriptions built, returning base prompt")
            return base_prompt

        logger.info(
            "Returning schema-based prompt with %s fields", len(field_descriptions)
        )
        return self._format_prompt(field_descriptions)

    def _call_vision_model_with_schema(self, image_path: str, prompt: str) -> str:
        """Send ``prompt`` and the PNG at ``image_path`` to the vision model.

        The API key is read from the ``HUGGINGFACE_API_KEY`` environment
        variable.

        Returns:
            The model's reply text. When the key is missing or anything
            raises, a JSON string with an ``"error"`` key is returned
            instead; this method does not raise.
        """
        try:
            # Imported lazily so the module loads without huggingface_hub.
            from huggingface_hub import InferenceClient

            api_key = os.getenv("HUGGINGFACE_API_KEY")

            if not api_key:
                logger.error("HUGGINGFACE_API_KEY environment variable is not set")
                return json.dumps({
                    "error": "Hugging Face API key not configured",
                    "message": "Please set HUGGINGFACE_API_KEY environment variable",
                    "expected_structure": {
                        "vendor_name": "string",
                        "invoice_number": "string",
                        "invoice_date": "YYYY-MM-DD",
                        "due_date": "YYYY-MM-DD",
                        "amount": 0.0,
                        "tax_amount": 0.0,
                        "line_items": "[]",
                    },
                })

            # Never write any part of the secret to the log.
            logger.info("Hugging Face API key is configured")

            client = InferenceClient(token=api_key)

            with open(image_path, "rb") as image_file:
                base64_image = base64.b64encode(image_file.read()).decode("utf-8")

            logger.info(
                "Making request to Hugging Face with model: %s", self._VISION_MODEL
            )
            logger.info("Prompt length: %s characters", len(prompt))
            logger.info("Image base64 length: %s characters", len(base64_image))

            response = client.chat.completions.create(
                model=self._VISION_MODEL,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{base64_image}"
                                },
                            },
                        ],
                    }
                ],
                max_tokens=self._MAX_RESPONSE_TOKENS,
            )

            extracted_text = response.choices[0].message.content

            logger.info("Hugging Face API call successful")
            logger.info("Response length: %s characters", len(extracted_text))

            return extracted_text

        except Exception as e:
            logger.error("Vision model call failed: %s", str(e))
            return json.dumps({
                "error": f"Vision model processing failed - {str(e)}"
            })

    @staticmethod
    def _first_present(data: Dict[str, Any], keys, default: Any) -> Any:
        """Return the first truthy ``data[key]`` for ``keys``, else ``default``."""
        for key in keys:
            value = data.get(key)
            if value:
                return value
        return default

    @staticmethod
    def _normalize_date(value: Any) -> Any:
        """Convert ``M/D/Y`` or ``Y/M/D`` slash dates to ``YYYY-MM-DD``.

        Strings that are not three slash-separated parts are returned
        unchanged; non-strings are converted with ``str``. A two-digit year
        is assumed to be in the 2000s.
        """
        if not isinstance(value, str):
            return str(value)
        parts = [part.strip() for part in value.split("/")]
        if len(parts) != 3:
            return value
        if len(parts[0]) == 4:
            # Already year-first (e.g. 2024/03/05); only the separator changes.
            year, month, day = parts
        else:
            month, day, year = parts
            if len(year) == 2:
                year = "20" + year
        return f"{year}-{month.zfill(2)}-{day.zfill(2)}"

    @staticmethod
    def _to_float(value: Any) -> float:
        """Convert ``value`` to ``float``, accepting ``"$1,234.50"`` style text.

        Commas are removed only when they form regular thousands groups, so
        decimal-comma text such as ``"1 234,56"`` still raises ``ValueError``
        rather than being misread.

        Raises:
            ValueError: If the text is not a recognisable number.
        """
        if not isinstance(value, str):
            return float(value)
        import re  # local import: the module does not import re at top level

        text = value.strip()
        if text.startswith("$"):
            text = text[1:].strip()
        if re.fullmatch(r"[+-]?\d{1,3}(?:,\d{3})+(?:\.\d*)?", text):
            text = text.replace(",", "")
        return float(text)

    def _convert_to_mcp_format(self, extracted_data: Dict[str, Any]) -> Dict[str, Any]:
        """Reshape the model's flat invoice dict into the MCP bridge format.

        The payload may be wrapped as ``{"data": {...}}``. Each field is
        looked up under snake_case, PascalCase and camelCase spellings.

        Returns:
            The nested record with ``status == "processed"``, or a blank
            record with ``status == "conversion_error"`` when anything in the
            conversion raises (for example a non-numeric amount).
        """
        try:
            if isinstance(extracted_data, dict) and "data" in extracted_data:
                actual_data = extracted_data["data"]
            else:
                actual_data = extracted_data

            invoice_number = self._first_present(
                actual_data, ("invoice_number", "InvoiceNumber", "invoiceNumber"), ""
            )

            invoice_date = self._first_present(
                actual_data, ("invoice_date", "InvoiceDate", "invoiceDate"), ""
            )
            if invoice_date:
                invoice_date = self._normalize_date(invoice_date)

            due_date = self._first_present(
                actual_data, ("due_date", "DueDate", "dueDate"), ""
            )
            if due_date:
                due_date = self._normalize_date(due_date)

            # "total_amount" comes first because that is the name the prompt
            # asks the model to use.
            amount = self._first_present(
                actual_data,
                ("total_amount", "amount", "InvoiceTotal", "invoiceTotal", "total"),
                0,
            )

            vendor_name = self._first_present(
                actual_data,
                ("vendor_name", "VendorName", "vendorName", "Company", "company"),
                "",
            )

            tax_amount = self._first_present(
                actual_data, ("tax_amount", "TaxAmount", "taxAmount"), 0
            )

            line_items = self._first_present(
                actual_data,
                ("line_items", "Order", "order", "LineItems", "lineItems"),
                [],
            )
            if isinstance(line_items, str):
                # Line items may arrive as a JSON-encoded string.
                try:
                    line_items = json.loads(line_items)
                except ValueError:
                    line_items = []

            total = self._to_float(amount)

            return {
                "headerSection": {
                    "vendorName": vendor_name,
                    "subtotal": total,
                },
                "billingDetails": {
                    "invoiceNumber": invoice_number,
                    "invoiceDate": invoice_date,
                    "dueDate": due_date,
                },
                "chargesSummary": {
                    "document_total": total,
                    "secondary_tax": self._to_float(tax_amount),
                    "lineItemsBreakdown": line_items,
                },
                "status": "processed",
            }

        except Exception as e:
            logger.error("Error converting to MCP format: %s", str(e))
            return self._empty_mcp_payload("conversion_error")
|
470
|
-
|
471
|
-
class DatabaseQueryTool:
    """Report table schemas.

    The current implementation is a mock: it returns a fixed column list and
    never reads ``credentials`` or contacts a database.
    """

    def __init__(self, credentials: Dict[str, Any]):
        # Stored for later use; no method in this class reads it yet.
        self.credentials = credentials

    def get_schema(self, table_name: str) -> Dict[str, Any]:
        """Return the hard-coded invoice schema labelled with ``table_name``."""
        column_specs = (
            ("id", "integer"),
            ("vendor_name", "text"),
            ("invoice_number", "text"),
            ("invoice_date", "date"),
            ("amount", "decimal"),
            ("created_at", "timestamp"),
        )
        columns = [{"name": name, "type": kind} for name, kind in column_specs]
        # "id" is the only column flagged as primary key.
        columns[0]["primary_key"] = True

        schema_info = {"table": table_name, "columns": columns}
        return {"success": True, "data": schema_info}
|
494
|
-
|
495
|
-
class OCRTool:
    """Placeholder OCR tool.

    No OCR is performed: the input is ignored and a fixed sample string is
    returned.
    """

    def extract_text(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Return a success payload containing the fixed sample text."""
        payload = {"extracted_text": "Sample extracted text from document"}
        return {"success": True, "data": payload}
|
506
|
-
|
507
|
-
class InvoiceExtractionWorkflow:
    """Turn invoice text or already-structured JSON into a flat invoice record."""

    # Vendor names recognised by the plain-text fallback parser.
    # NOTE(review): this list is hard-coded; other vendors are never detected
    # from plain text.
    _KNOWN_VENDORS = ("SUPERIOR PROPANE", "CHEP CANADA INC")

    def __init__(self):
        pass

    def extract_data(self, text: str, schema: Dict[str, Any]) -> Dict[str, Any]:
        """Extract a structured invoice record from ``text``.

        Args:
            text: A dict (already structured), a JSON object string, or free
                text that is scanned for labelled lines.
            schema: Accepted for interface compatibility; not used.

        Returns:
            ``{"success": True, "data": {"extracted_data": record}}`` with
            ``record["status"] == "processed"`` on success. When neither a
            vendor name nor an invoice number is found, or an exception
            occurs, ``success`` is ``False`` and a blank record is returned
            with status ``no_data_from_previous_tool`` or ``error``.
        """
        try:
            if isinstance(text, dict):
                # Work on a copy so the caller's dict is not modified below.
                invoice_data = dict(text)
            else:
                try:
                    parsed = json.loads(text)
                except json.JSONDecodeError:
                    parsed = None
                if isinstance(parsed, dict):
                    invoice_data = parsed
                else:
                    # Not JSON, or JSON that is not an object (list, number):
                    # fall back to scanning the raw text.
                    invoice_data = self._parse_text_to_data(text)

            if not invoice_data.get("vendor_name") and not invoice_data.get("invoice_number"):
                logger.warning("No invoice data found in input - this might be a workflow issue")
                return self._failure_payload("no_data_from_previous_tool")

            if invoice_data.get("invoice_date"):
                invoice_data["invoice_date"] = self._convert_date_format(
                    invoice_data["invoice_date"]
                )

            # line_items is returned as a JSON string, never as a list.
            if isinstance(invoice_data.get("line_items"), list):
                invoice_data["line_items"] = json.dumps(invoice_data["line_items"])

            invoice_data["status"] = "processed"

            return {"success": True, "data": {"extracted_data": invoice_data}}

        except Exception as e:
            logger.error(f"Invoice extraction failed: {str(e)}")
            return self._failure_payload("error")

    @staticmethod
    def _failure_payload(status: str) -> Dict[str, Any]:
        """Return the ``success: False`` response with a blank record."""
        return {
            "success": False,
            "data": {
                "extracted_data": {
                    "vendor_name": "",
                    "invoice_number": "",
                    "invoice_date": "",
                    "amount": 0.0,
                    "tax_amount": 0.0,
                    "line_items": "[]",
                    "status": status,
                }
            },
        }

    @staticmethod
    def _parse_amount(raw: str) -> Optional[float]:
        """Parse a money string such as ``"$1,234.50"``; ``None`` if invalid.

        Commas are dropped only when they form regular thousands groups, so
        decimal-comma text is rejected rather than misread.
        """
        import re  # local import: the module does not import re at top level

        text = raw.strip()
        if text.startswith("$"):
            text = text[1:].strip()
        if re.fullmatch(r"[+-]?\d{1,3}(?:,\d{3})+(?:\.\d*)?", text):
            text = text.replace(",", "")
        try:
            return float(text)
        except ValueError:
            return None

    def _parse_text_to_data(self, text: str) -> Dict[str, Any]:
        """Scan free text line by line for labelled invoice fields.

        Recognised labels are ``Invoice Number:``, ``Invoice Date:``,
        ``Order total:`` and ``GST - HST / TPS -TVH:``. The value is
        everything after the label on that line. Amounts that cannot be
        parsed leave the 0.0 default in place. Lines without a label are
        checked against ``_KNOWN_VENDORS``.
        """
        invoice_data = {
            "vendor_name": "",
            "invoice_number": "",
            "invoice_date": "",
            "amount": 0.0,
            "tax_amount": 0.0,
            "line_items": "[]",
            "status": "processed",
        }

        def value_after(line: str, label: str) -> str:
            # partition keeps colons inside the value and ignores colons that
            # appear before the label.
            return line.partition(label)[2].strip()

        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if "Invoice Number:" in line:
                invoice_data["invoice_number"] = value_after(line, "Invoice Number:")
            elif "Invoice Date:" in line:
                invoice_data["invoice_date"] = value_after(line, "Invoice Date:")
            elif "Order total:" in line:
                amount = self._parse_amount(value_after(line, "Order total:"))
                if amount is not None:
                    invoice_data["amount"] = amount
            elif "GST - HST / TPS -TVH:" in line:
                tax = self._parse_amount(value_after(line, "GST - HST / TPS -TVH:"))
                if tax is not None:
                    invoice_data["tax_amount"] = tax
            else:
                for vendor in self._KNOWN_VENDORS:
                    if vendor in line:
                        invoice_data["vendor_name"] = vendor
                        break

        return invoice_data

    def _convert_date_format(self, date_str: str) -> str:
        """Convert ``M/D/Y`` or ``Y/M/D`` slash dates to ``YYYY-MM-DD``.

        Anything that is not a string of three slash-separated parts is
        returned unchanged. A two-digit year is assumed to be in the 2000s.
        """
        if not isinstance(date_str, str):
            return date_str
        parts = [part.strip() for part in date_str.split("/")]
        if len(parts) != 3:
            return date_str
        if len(parts[0]) == 4:
            # Already year-first (e.g. 2024/03/05); only the separator changes.
            year, month, day = parts
        else:
            month, day, year = parts
            if len(year) == 2:
                year = "20" + year
        return f"{year}-{month.zfill(2)}-{day.zfill(2)}"
|
639
|
-
|
640
|
-
class DataValidator:
    """Placeholder validator.

    No checks are performed: every call reports the data as valid with an
    empty error list, regardless of ``data`` or ``schema``.
    """

    def validate(self, data: Dict[str, Any], schema: Dict[str, Any]) -> Dict[str, Any]:
        """Return a success payload with ``valid: True`` and no errors."""
        verdict = {"valid": True, "errors": []}
        return {"success": True, "data": verdict}
|
652
|
-
|
653
|
-
class PostgresInsert:
    """Placeholder PostgreSQL writer.

    ``insert_record`` does not open a connection or use ``credentials``; it
    returns a fixed success payload with ``inserted_id`` 123.
    """

    def __init__(self, credentials: Dict[str, Any]):
        # Stored for later use; no method in this class reads it yet.
        self.credentials = credentials

    def insert_record(self, table: str, data: Dict[str, Any]) -> Dict[str, Any]:
        """Return the fixed success payload labelled with ``table``."""
        outcome = {
            "table": table,
            "inserted_id": 123,
            "message": "Record inserted successfully",
        }
        return {"success": True, "data": outcome}
|