memra 0.2.13__py3-none-any.whl → 0.2.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. memra/cli.py +322 -51
  2. {memra-0.2.13.dist-info → memra-0.2.15.dist-info}/METADATA +1 -1
  3. {memra-0.2.13.dist-info → memra-0.2.15.dist-info}/RECORD +7 -61
  4. memra-0.2.15.dist-info/top_level.txt +1 -0
  5. memra-0.2.13.dist-info/top_level.txt +0 -4
  6. memra-ops/app.py +0 -808
  7. memra-ops/config/config.py +0 -25
  8. memra-ops/config.py +0 -34
  9. memra-ops/logic/__init__.py +0 -1
  10. memra-ops/logic/file_tools.py +0 -43
  11. memra-ops/logic/invoice_tools.py +0 -668
  12. memra-ops/logic/invoice_tools_fix.py +0 -66
  13. memra-ops/mcp_bridge_server.py +0 -1178
  14. memra-ops/scripts/check_database.py +0 -37
  15. memra-ops/scripts/clear_database.py +0 -48
  16. memra-ops/scripts/monitor_database.py +0 -67
  17. memra-ops/scripts/release.py +0 -133
  18. memra-ops/scripts/reset_database.py +0 -65
  19. memra-ops/scripts/start_memra.py +0 -334
  20. memra-ops/scripts/stop_memra.py +0 -132
  21. memra-ops/server_tool_registry.py +0 -190
  22. memra-ops/tests/test_llm_text_to_sql.py +0 -115
  23. memra-ops/tests/test_llm_vs_pattern.py +0 -130
  24. memra-ops/tests/test_mcp_schema_aware.py +0 -124
  25. memra-ops/tests/test_schema_aware_sql.py +0 -139
  26. memra-ops/tests/test_schema_aware_sql_simple.py +0 -66
  27. memra-ops/tests/test_text_to_sql_demo.py +0 -140
  28. memra-ops/tools/mcp_bridge_server.py +0 -851
  29. memra-sdk/examples/accounts_payable.py +0 -215
  30. memra-sdk/examples/accounts_payable_client.py +0 -217
  31. memra-sdk/examples/accounts_payable_mcp.py +0 -200
  32. memra-sdk/examples/ask_questions.py +0 -123
  33. memra-sdk/examples/invoice_processing.py +0 -116
  34. memra-sdk/examples/propane_delivery.py +0 -87
  35. memra-sdk/examples/simple_text_to_sql.py +0 -158
  36. memra-sdk/memra/__init__.py +0 -31
  37. memra-sdk/memra/discovery.py +0 -15
  38. memra-sdk/memra/discovery_client.py +0 -49
  39. memra-sdk/memra/execution.py +0 -481
  40. memra-sdk/memra/models.py +0 -99
  41. memra-sdk/memra/tool_registry.py +0 -343
  42. memra-sdk/memra/tool_registry_client.py +0 -106
  43. memra-sdk/scripts/release.py +0 -133
  44. memra-sdk/setup.py +0 -52
  45. memra-workflows/accounts_payable/accounts_payable.py +0 -215
  46. memra-workflows/accounts_payable/accounts_payable_client.py +0 -216
  47. memra-workflows/accounts_payable/accounts_payable_mcp.py +0 -200
  48. memra-workflows/accounts_payable/accounts_payable_smart.py +0 -221
  49. memra-workflows/invoice_processing/invoice_processing.py +0 -116
  50. memra-workflows/invoice_processing/smart_invoice_processor.py +0 -220
  51. memra-workflows/logic/__init__.py +0 -1
  52. memra-workflows/logic/file_tools.py +0 -50
  53. memra-workflows/logic/invoice_tools.py +0 -501
  54. memra-workflows/logic/propane_agents.py +0 -52
  55. memra-workflows/mcp_bridge_server.py +0 -230
  56. memra-workflows/propane_delivery/propane_delivery.py +0 -87
  57. memra-workflows/text_to_sql/complete_invoice_workflow_with_queries.py +0 -208
  58. memra-workflows/text_to_sql/complete_text_to_sql_system.py +0 -266
  59. memra-workflows/text_to_sql/file_discovery_demo.py +0 -156
  60. {memra-0.2.13.dist-info → memra-0.2.15.dist-info}/LICENSE +0 -0
  61. {memra-0.2.13.dist-info → memra-0.2.15.dist-info}/WHEEL +0 -0
  62. {memra-0.2.13.dist-info → memra-0.2.15.dist-info}/entry_points.txt +0 -0
@@ -1,668 +0,0 @@
1
- """
2
- Invoice processing tools for the Memra API server
3
- """
4
-
5
- import os
6
- import logging
7
- import json
8
- import tempfile
9
- from typing import Dict, Any, Optional, List
10
- import subprocess
11
- from PIL import Image
12
- import base64
13
- import io
14
- import uuid
15
- from pathlib import Path
16
- import requests
17
-
18
- logger = logging.getLogger(__name__)
19
-
20
class PDFProcessor:
    """Process PDF files and extract invoice content using a vision model.

    :meth:`process_pdf` renders the first page of a PDF to a PNG with
    ``pdftoppm``, sends that image together with a schema-derived prompt to a
    Hugging Face hosted vision model, parses the JSON answer and reshapes it
    into the nested ``headerSection`` / ``billingDetails`` / ``chargesSummary``
    structure used by the MCP bridge.
    """

    # Hosted model used for the extraction request.
    VISION_MODEL = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
    # Upper bound on generated tokens per request.
    VISION_MAX_TOKENS = 500
    # Seconds before the pdftoppm subprocess is abandoned.
    SCREENSHOT_TIMEOUT = 15

    _PROMPT_HEADER = (
        "Extract invoice data from this image and return ONLY a JSON object "
        "with these specific fields:"
    )
    _PROMPT_FOOTER = (
        "\n\nLook specifically for the company/vendor name prominently "
        "displayed on the invoice."
        "\n\nReturn ONLY valid JSON with no additional text or explanation."
    )
    # Columns that are filled in by the system and never asked of the model.
    _SYSTEM_COLUMNS = ("id", "created_at", "updated_at", "status", "raw_json")
    # Human-readable hints for well-known columns; others fall back to their type.
    _FIELD_HINTS = {
        "vendor_name": "The company name at the top of the invoice",
        "invoice_number": "The invoice number or ID",
        "invoice_date": "The date the invoice was issued (YYYY-MM-DD format)",
        "total_amount": "The total invoice amount",
        "due_date": "The invoice due date (YYYY-MM-DD format)",
        "tax_amount": "The tax amount on the invoice",
        "line_items": "Array of items with descriptions and amounts",
    }
    # Fields requested when no usable schema is supplied.
    _DEFAULT_PROMPT_FIELDS = (
        "vendor_name",
        "invoice_number",
        "invoice_date",
        "total_amount",
        "line_items",
    )

    def __init__(self):
        self.upload_dir = "/tmp/uploads"
        self.screenshots_dir = "/tmp/screenshots"
        # Ensure directories exist
        os.makedirs(self.screenshots_dir, exist_ok=True)

    def process_pdf(self, file_path: str, schema: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Process a PDF file and extract invoice data using the vision model.

        Args:
            file_path: Path of the PDF. Paths starting with ``/uploads/`` are
                resolved by basename inside ``self.upload_dir``.
            schema: Optional table schema (a ``{"columns": [...]}`` dict or a
                bare list of column dicts) used to build the prompt.

        Returns:
            ``{"success": False, "error": ...}`` when the file is missing,
            no screenshot could be produced, or an unexpected exception
            occurred. Otherwise ``{"success": True, "data": {...}}`` where
            ``data["extracted_data"]`` is the MCP-format payload; its
            ``status`` field distinguishes ``processed`` from
            ``vision_model_error`` / ``json_parse_error`` / ``conversion_error``.
        """
        try:
            if not file_path:
                return {
                    "success": False,
                    "error": "No file path provided"
                }

            # Handle uploaded files
            if file_path.startswith('/uploads/'):
                full_path = os.path.join(self.upload_dir, os.path.basename(file_path))
            else:
                full_path = file_path

            if not os.path.exists(full_path):
                return {
                    "success": False,
                    "error": f"PDF file not found: {file_path}"
                }

            logger.info(f"Processing PDF: {file_path}")

            # Step 1: Create invoice-specific directory
            invoice_id = str(uuid.uuid4())
            invoice_dir = os.path.join(self.screenshots_dir, invoice_id)
            os.makedirs(invoice_dir, exist_ok=True)

            # Step 2: Convert PDF pages to screenshots
            logger.info("Creating screenshots...")
            screenshot_paths = self._create_screenshots(full_path, invoice_dir)
            if not screenshot_paths:
                return {
                    "success": False,
                    "error": "Failed to create screenshots from PDF (timeout or error)"
                }

            # Step 3: Send screenshots + prompt + schema to vision model
            logger.info(f"Sending {len(screenshot_paths)} screenshots to vision model with schema...")
            vision_prompt = self._build_schema_prompt(schema)

            # Log and print the prompt being sent to vision model
            logger.info(f"Vision Model Prompt: {vision_prompt}")
            print("\n🔎 VISION MODEL PROMPT:")
            print("=" * 60)
            print(vision_prompt)
            print("=" * 60)

            # Only the first screenshot is sent to the model.
            vision_response = self._call_vision_model_with_schema(screenshot_paths[0], vision_prompt)

            # Log and print the JSON response from vision model
            logger.info(f"Vision Model JSON Response: {vision_response}")
            print("\n📝 VISION MODEL JSON RESPONSE:")
            print("=" * 60)
            print(vision_response)
            print("=" * 60)

            # Step 4: Parse the JSON response
            mcp_format_data = self._parse_vision_response(vision_response)

            return {
                "success": True,
                "data": {
                    "file_path": file_path,
                    "invoice_id": invoice_id,
                    "screenshots_dir": invoice_dir,
                    "screenshot_count": len(screenshot_paths),
                    "vision_prompt": vision_prompt,
                    "vision_response": vision_response,
                    "extracted_data": mcp_format_data
                }
            }

        except Exception as e:
            # Top-level boundary: report any failure as a structured error.
            logger.error(f"PDF processing failed: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Remove a surrounding Markdown code fence (with or without ``json``)."""
        cleaned = text.strip()
        if cleaned.lower().startswith('```json'):
            cleaned = cleaned[7:]
        elif cleaned.startswith('```'):
            cleaned = cleaned[3:]
        if cleaned.endswith('```'):
            cleaned = cleaned[:-3]
        return cleaned.strip()

    @staticmethod
    def _empty_mcp_payload(status: str, **extra: Any) -> Dict[str, Any]:
        """Return an MCP-format payload with blank fields and the given status."""
        payload = {
            "headerSection": {"vendorName": "", "subtotal": 0},
            "billingDetails": {"invoiceNumber": "", "invoiceDate": "", "dueDate": ""},
            "chargesSummary": {"document_total": 0, "secondary_tax": 0, "lineItemsBreakdown": []},
            "status": status
        }
        payload.update(extra)
        return payload

    def _parse_vision_response(self, vision_response: str) -> Dict[str, Any]:
        """Turn the raw model reply into an MCP-format payload.

        Error replies produced by :meth:`_call_vision_model_with_schema` are
        valid JSON objects with an ``error`` key, so they are detected after a
        successful parse as well as in unparsable text.
        """
        try:
            extracted_data = json.loads(self._strip_code_fence(vision_response))
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse JSON response: {e}")
            # If it's an error response, create a structured error
            if "error" in vision_response.lower():
                return self._empty_mcp_payload("vision_model_error", error_message=vision_response)
            return self._empty_mcp_payload("json_parse_error", raw_response=vision_response)

        logger.info(f"Successfully parsed JSON response: {extracted_data}")
        if isinstance(extracted_data, dict) and extracted_data.get("error"):
            return self._empty_mcp_payload("vision_model_error", error_message=vision_response)

        # Convert to MCP bridge expected format
        return self._convert_to_mcp_format(extracted_data)

    def _create_screenshots(self, pdf_path: str, output_dir: str) -> List[str]:
        """Render the first PDF page to a 100 DPI PNG inside ``output_dir``.

        Returns:
            Sorted paths of every ``.png`` found in ``output_dir`` afterwards,
            or an empty list when ``pdftoppm`` fails, times out or cannot run.
        """
        try:
            # Use pdftoppm to convert PDF to images with lower resolution for speed
            cmd = [
                'pdftoppm',
                '-png',            # Output format
                '-r', '100',       # Low resolution (100 DPI) for speed
                '-cropbox',        # Use crop box for consistent sizing
                '-f', '1',         # Start from page 1
                '-l', '1',         # Only process first page for speed
                pdf_path,          # Input PDF
                os.path.join(output_dir, 'page')  # Output prefix
            ]

            # Add timeout to prevent hanging
            result = subprocess.run(
                cmd, capture_output=True, text=True, timeout=self.SCREENSHOT_TIMEOUT
            )
            if result.returncode != 0:
                logger.error(f"pdftoppm failed: {result.stderr}")
                return []

            # Find generated image files
            screenshot_paths = [
                os.path.join(output_dir, file_name)
                for file_name in sorted(os.listdir(output_dir))
                if file_name.endswith('.png')
            ]

            logger.info(f"Created {len(screenshot_paths)} screenshots in {output_dir}")
            return screenshot_paths

        except subprocess.TimeoutExpired:
            logger.error(f"Screenshot creation timed out after {self.SCREENSHOT_TIMEOUT} seconds")
            return []
        except Exception as e:
            logger.error(f"Screenshot creation failed: {str(e)}")
            return []

    def _render_prompt(self, field_lines: List[str]) -> str:
        """Assemble the full prompt text around the given ``- name: hint`` lines."""
        return self._PROMPT_HEADER + "\n" + "\n".join(field_lines) + self._PROMPT_FOOTER

    def _build_schema_prompt(self, schema: Dict[str, Any]) -> str:
        """Build the extraction prompt, listing the schema's columns when given.

        ``schema`` may be a ``{"columns": [...]}`` dict or a bare list of
        column dicts; each column is read via ``column_name``/``name`` and
        ``data_type``/``type``. System columns are skipped. Any other schema
        shape, or a schema that yields no fields, produces the default prompt.
        """
        logger.info(f"_build_schema_prompt called with schema type: {type(schema)}")
        logger.info(f"Schema content: {schema}")

        # Default base prompt with essential fields
        base_prompt = self._render_prompt(
            [f"- {name}: {self._FIELD_HINTS[name]}" for name in self._DEFAULT_PROMPT_FIELDS]
        )

        # If no schema provided, return the base prompt
        if not schema:
            logger.info("No schema provided, returning base prompt")
            return base_prompt

        # Handle different schema formats
        if isinstance(schema, list):
            # Client sends array of column objects directly
            columns = schema
        elif isinstance(schema, dict) and "columns" in schema:
            # Standard format with columns array; tolerate an explicit null
            columns = schema["columns"] or []
        else:
            # Unknown format, use base prompt
            return base_prompt

        # Build field descriptions from schema
        field_descriptions = []
        logger.info(f"Building prompt from {len(columns)} columns")
        for col in columns:
            if not isinstance(col, dict):
                # Malformed column entry: nothing to describe.
                continue
            # Handle both formats: {"column_name": "x"} and {"name": "x"}
            name = col.get("column_name") or col.get("name", "")
            col_type = col.get("data_type") or col.get("type", "")
            logger.info(f"Processing column: {name} ({col_type})")

            # Skip system fields
            if name and name not in self._SYSTEM_COLUMNS:
                field_descriptions.append(f"- {name}: {self._FIELD_HINTS.get(name, col_type)}")

        # If we have field descriptions, use them; otherwise use base prompt
        logger.info(f"Built {len(field_descriptions)} field descriptions")
        if field_descriptions:
            logger.info(f"Returning schema-based prompt with {len(field_descriptions)} fields")
            return self._render_prompt(field_descriptions)

        logger.info("No field descriptions built, returning base prompt")
        return base_prompt

    def _call_vision_model_with_schema(self, image_path: str, prompt: str) -> str:
        """Send the image and prompt to the Hugging Face hosted vision model.

        Returns:
            The model's text reply. When ``HUGGINGFACE_API_KEY`` is unset or
            the call fails, a JSON string with an ``error`` key is returned
            instead of raising.
        """
        try:
            # Imported lazily so the module loads without huggingface_hub installed.
            from huggingface_hub import InferenceClient

            # Get API key from environment
            api_key = os.getenv("HUGGINGFACE_API_KEY")

            if not api_key:
                logger.error("HUGGINGFACE_API_KEY environment variable is not set")
                return json.dumps({
                    "error": "Hugging Face API key not configured",
                    "message": "Please set HUGGINGFACE_API_KEY environment variable",
                    "expected_structure": {
                        "vendor_name": "string",
                        "invoice_number": "string",
                        "invoice_date": "YYYY-MM-DD",
                        "due_date": "YYYY-MM-DD",
                        "amount": 0.0,
                        "tax_amount": 0.0,
                        "line_items": "[]"
                    }
                })

            # Never write any part of the secret to the logs.
            logger.info("Hugging Face API key found in environment")

            client = InferenceClient(token=api_key)

            # Encode image to base64
            with open(image_path, "rb") as image_file:
                base64_image = base64.b64encode(image_file.read()).decode("utf-8")

            # Log the request details for debugging
            logger.info(f"Making request to Hugging Face with model: {self.VISION_MODEL}")
            logger.info(f"Prompt length: {len(prompt)} characters")
            logger.info(f"Image base64 length: {len(base64_image)} characters")

            response = client.chat.completions.create(
                model=self.VISION_MODEL,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}}
                        ]
                    }
                ],
                max_tokens=self.VISION_MAX_TOKENS,
            )

            # Extract the response content
            extracted_text = response.choices[0].message.content

            logger.info("Hugging Face API call successful")
            logger.info(f"Response length: {len(extracted_text)} characters")

            return extracted_text

        except Exception as e:
            logger.error(f"Vision model call failed: {str(e)}")
            return json.dumps({
                "error": f"Vision model processing failed - {str(e)}"
            })

    @staticmethod
    def _first_present(data: Dict[str, Any], keys: List[str], default: Any) -> Any:
        """Return the first truthy ``data[key]`` among ``keys``, else ``default``."""
        for key in keys:
            value = data.get(key)
            if value:
                return value
        return default

    @staticmethod
    def _normalize_date(value: Any) -> Any:
        """Convert ``M/D/YY`` or ``M/D/YYYY`` text to ``YYYY-MM-DD``.

        Two-digit years are prefixed with ``20``. Anything else, including
        non-string values, is returned unchanged.
        """
        if not isinstance(value, str):
            return value
        parts = value.split("/")
        if len(parts) != 3:
            return value
        month, day, year = parts
        if len(year) == 2:
            year = "20" + year
        return f"{year}-{month.zfill(2)}-{day.zfill(2)}"

    @staticmethod
    def _to_float(value: Any) -> float:
        """Coerce a number or currency-formatted string to ``float``.

        Commas are treated as thousands separators and ``$``/``€``/``£`` and
        spaces are dropped. Unparsable input yields ``0.0`` (with a warning)
        so one bad amount does not discard the other extracted fields.
        """
        if isinstance(value, str):
            for symbol in (",", "$", "€", "£", " "):
                value = value.replace(symbol, "")
        try:
            return float(value)
        except (TypeError, ValueError):
            logger.warning(f"Could not interpret {value!r} as a number; using 0.0")
            return 0.0

    def _convert_to_mcp_format(self, extracted_data: Dict[str, Any]) -> Dict[str, Any]:
        """Convert extracted data to MCP bridge expected format.

        Accepts either a flat dict or one nested under a ``"data"`` key, and
        looks each field up under snake_case, PascalCase and camelCase names.

        Returns:
            The MCP-format dict with ``status`` ``"processed"``, or a blank
            payload with ``status`` ``"conversion_error"`` when the input is
            not a dict or conversion fails unexpectedly.
        """
        try:
            # The vision model might return {"data": {...}} or the fields directly.
            actual_data = extracted_data
            if isinstance(extracted_data, dict) and isinstance(extracted_data.get("data"), dict):
                actual_data = extracted_data["data"]

            if not isinstance(actual_data, dict):
                raise TypeError(f"expected a JSON object, got {type(actual_data).__name__}")

            invoice_number = self._first_present(
                actual_data, ["invoice_number", "InvoiceNumber", "invoiceNumber"], ""
            )
            invoice_date = self._normalize_date(self._first_present(
                actual_data, ["invoice_date", "InvoiceDate", "invoiceDate"], ""
            ))
            due_date = self._normalize_date(self._first_present(
                actual_data, ["due_date", "DueDate", "dueDate"], ""
            ))
            # "total_amount" comes first because it is the name used in the prompt.
            amount = self._to_float(self._first_present(
                actual_data, ["total_amount", "amount", "InvoiceTotal", "invoiceTotal", "total"], 0
            ))
            vendor_name = self._first_present(
                actual_data, ["vendor_name", "VendorName", "vendorName", "Company", "company"], ""
            )
            tax_amount = self._to_float(self._first_present(
                actual_data, ["tax_amount", "TaxAmount", "taxAmount"], 0
            ))
            line_items = self._first_present(
                actual_data, ["line_items", "Order", "order", "LineItems", "lineItems"], []
            )

            # Line items may arrive as a JSON-encoded string.
            if isinstance(line_items, str):
                try:
                    line_items = json.loads(line_items)
                except ValueError:
                    line_items = []

            # Convert to MCP bridge format
            return {
                "headerSection": {
                    "vendorName": vendor_name,
                    "subtotal": amount
                },
                "billingDetails": {
                    "invoiceNumber": invoice_number,
                    "invoiceDate": invoice_date,
                    "dueDate": due_date
                },
                "chargesSummary": {
                    "document_total": amount,
                    "secondary_tax": tax_amount,
                    "lineItemsBreakdown": line_items
                },
                "status": "processed"
            }

        except Exception as e:
            logger.error(f"Error converting to MCP format: {str(e)}")
            return self._empty_mcp_payload("conversion_error")
470
-
471
class DatabaseQueryTool:
    """Query database schemas and data.

    Schema lookups are currently mocked: the stored credentials are kept on
    the instance but not used to reach a database.
    """

    def __init__(self, credentials: Dict[str, Any]):
        self.credentials = credentials

    def get_schema(self, table_name: str) -> Dict[str, Any]:
        """Return the (mock) column layout for ``table_name``.

        The same fixed column list is returned for every table name.
        """
        # Mock schema for now: (column name, column type) in table order.
        column_specs = [
            ("id", "integer"),
            ("vendor_name", "text"),
            ("invoice_number", "text"),
            ("invoice_date", "date"),
            ("amount", "decimal"),
            ("created_at", "timestamp"),
        ]
        columns = [{"name": col_name, "type": col_type} for col_name, col_type in column_specs]
        # Only the first column ("id") is flagged as the primary key.
        columns[0]["primary_key"] = True

        payload = {"table": table_name, "columns": columns}
        return {"success": True, "data": payload}
494
-
495
class OCRTool:
    """Perform OCR on images and documents.

    This is a placeholder: no OCR engine is invoked and the input is ignored.
    """

    def extract_text(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Return a fixed sample text wrapped in the standard result envelope."""
        result_data = {"extracted_text": "Sample extracted text from document"}
        return {"success": True, "data": result_data}
506
-
507
class InvoiceExtractionWorkflow:
    """Extract structured data from invoices.

    Input may be an already-structured dict, a JSON string, or plain invoice
    text that is scanned line by line for a few known labels.
    """

    def __init__(self):
        pass

    @staticmethod
    def _failure_result(status: str) -> Dict[str, Any]:
        """Return the blank, unsuccessful result envelope with ``status``."""
        return {
            "success": False,
            "data": {
                "extracted_data": {
                    "vendor_name": "",
                    "invoice_number": "",
                    "invoice_date": "",
                    "amount": 0.0,
                    "tax_amount": 0.0,
                    "line_items": "[]",
                    "status": status
                }
            }
        }

    def extract_data(self, text: Any, schema: Dict[str, Any]) -> Dict[str, Any]:
        """Extract structured data from invoice text or JSON.

        Args:
            text: A dict of invoice fields, a JSON string, or free text.
                A dict argument is copied, not modified.
            schema: Accepted for interface compatibility; not used here.

        Returns:
            ``{"success": True, "data": {"extracted_data": {...}}}`` with the
            invoice date normalised, ``line_items`` JSON-encoded and
            ``status`` set to ``"processed"``. When neither a vendor name nor
            an invoice number is present, or on any unexpected error, a blank
            unsuccessful envelope is returned with status
            ``"no_data_from_previous_tool"`` or ``"error"`` respectively.
        """
        try:
            if isinstance(text, dict):
                # Already structured (e.g. from the vision model). Copy so the
                # field rewrites below do not leak into the caller's dict.
                invoice_data = dict(text)
            else:
                # Try to parse as JSON first, then fall back to text parsing.
                try:
                    invoice_data = json.loads(text)
                except json.JSONDecodeError:
                    invoice_data = self._parse_text_to_data(text)
                else:
                    # Valid JSON that is not an object (list, number, ...)
                    # carries no named fields; treat it as plain text.
                    if not isinstance(invoice_data, dict):
                        invoice_data = self._parse_text_to_data(text)

            # Without a vendor or invoice number there is nothing usable;
            # the data was probably expected from a previous tool.
            if not invoice_data.get("vendor_name") and not invoice_data.get("invoice_number"):
                logger.warning("No invoice data found in input - this might be a workflow issue")
                return self._failure_result("no_data_from_previous_tool")

            # Convert date format if needed
            if invoice_data.get("invoice_date"):
                invoice_data["invoice_date"] = self._convert_date_format(invoice_data["invoice_date"])

            # Ensure line_items is a JSON string
            if isinstance(invoice_data.get("line_items"), list):
                invoice_data["line_items"] = json.dumps(invoice_data["line_items"])

            # Set status
            invoice_data["status"] = "processed"

            return {
                "success": True,
                "data": {
                    "extracted_data": invoice_data
                }
            }

        except Exception as e:
            # Top-level boundary: callers always get the result envelope.
            logger.error(f"Invoice extraction failed: {str(e)}")
            return self._failure_result("error")

    @staticmethod
    def _value_after(line: str, label: str) -> str:
        """Return the stripped text following ``label`` in ``line``."""
        return line.partition(label)[2].strip()

    @staticmethod
    def _parse_amount(raw: str) -> Optional[float]:
        """Parse an amount such as ``"$1,234.50"``; return ``None`` if invalid.

        Commas are treated as thousands separators.
        """
        cleaned = raw.replace(",", "").replace("$", "").replace(" ", "")
        try:
            return float(cleaned)
        except ValueError:
            return None

    def _parse_text_to_data(self, text: str) -> Dict[str, Any]:
        """Parse text to extract invoice data (fallback method).

        Each line is checked for one known label or vendor name; the first
        match on a line wins. Unparsable amounts leave the default ``0.0``.
        """
        invoice_data = {
            "vendor_name": "",
            "invoice_number": "",
            "invoice_date": "",
            "amount": 0.0,
            "tax_amount": 0.0,
            "line_items": "[]",
            "status": "processed"
        }

        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if "Invoice Number:" in line:
                # Take everything after the label so values containing ':' survive.
                invoice_data["invoice_number"] = self._value_after(line, "Invoice Number:")
            elif "Invoice Date:" in line:
                invoice_data["invoice_date"] = self._value_after(line, "Invoice Date:")
            elif "Order total:" in line:
                amount = self._parse_amount(self._value_after(line, "Order total:"))
                if amount is not None:
                    invoice_data["amount"] = amount
            elif "GST - HST / TPS -TVH:" in line:
                tax = self._parse_amount(self._value_after(line, "GST - HST / TPS -TVH:"))
                if tax is not None:
                    invoice_data["tax_amount"] = tax
            elif "SUPERIOR PROPANE" in line:
                invoice_data["vendor_name"] = "SUPERIOR PROPANE"
            elif "CHEP CANADA INC" in line:
                invoice_data["vendor_name"] = "CHEP CANADA INC"

        return invoice_data

    def _convert_date_format(self, date_str: str) -> str:
        """Convert date from MM/DD/YY to YYYY-MM-DD format.

        Two-digit years are prefixed with ``20``. Values that are not
        three-part slash-separated strings are returned unchanged.
        """
        if not isinstance(date_str, str):
            return date_str

        parts = date_str.split("/")
        if len(parts) != 3:
            return date_str

        month, day, year = parts
        # Convert 2-digit year to 4-digit
        if len(year) == 2:
            year = "20" + year

        # Ensure proper formatting
        return f"{year}-{month.zfill(2)}-{day.zfill(2)}"
639
-
640
class DataValidator:
    """Validate data against schemas.

    This is a placeholder: no checks are performed and every input is
    reported as valid.
    """

    def validate(self, data: Dict[str, Any], schema: Dict[str, Any]) -> Dict[str, Any]:
        """Return a successful validation result with an empty error list."""
        verdict = {"valid": True, "errors": []}
        return {"success": True, "data": verdict}
652
-
653
class PostgresInsert:
    """Insert data into PostgreSQL database.

    This is a placeholder: the credentials are stored but no database
    connection is made and nothing is written.
    """

    def __init__(self, credentials: Dict[str, Any]):
        self.credentials = credentials

    def insert_record(self, table: str, data: Dict[str, Any]) -> Dict[str, Any]:
        """Report a successful insert into ``table`` with a fixed id of 123."""
        outcome = {
            "table": table,
            "inserted_id": 123,
            "message": "Record inserted successfully",
        }
        return {"success": True, "data": outcome}