idvpackage 3.0.11__py3-none-any.whl → 3.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- idvpackage/common.py +8 -966
- idvpackage/iraq_id_extraction_withopenai.py +374 -893
- idvpackage/jor_passport_extraction.py +1 -6
- idvpackage/liveness_spoofing_v2.py +2 -45
- idvpackage/ocr.py +1016 -2430
- idvpackage/ocr_utils.py +148 -489
- idvpackage/pse_passport_extraction.py +18 -292
- idvpackage/qatar_id_extraction.py +4 -956
- idvpackage/sudan_passport_extraction.py +0 -928
- idvpackage/syr_passport_extraction.py +27 -402
- idvpackage/uae_id_extraction.py +87 -151
- {idvpackage-3.0.11.dist-info → idvpackage-3.0.13.dist-info}/METADATA +1 -1
- idvpackage-3.0.13.dist-info/RECORD +34 -0
- {idvpackage-3.0.11.dist-info → idvpackage-3.0.13.dist-info}/WHEEL +1 -1
- idvpackage/ekyc.py +0 -78
- idvpackage/genai_utils.py +0 -309
- idvpackage/iraq_id_extraction.py +0 -992
- idvpackage/iraq_passport_extraction.py +0 -588
- idvpackage/lazy_imports.py +0 -44
- idvpackage/lebanon_passport_extraction.py +0 -161
- idvpackage/sau_id_extraction.py +0 -248
- idvpackage/sudan_id_extraction.py +0 -764
- idvpackage-3.0.11.dist-info/RECORD +0 -42
- {idvpackage-3.0.11.dist-info → idvpackage-3.0.13.dist-info}/licenses/LICENSE +0 -0
- {idvpackage-3.0.11.dist-info → idvpackage-3.0.13.dist-info}/top_level.txt +0 -0
|
@@ -1,220 +1,319 @@
|
|
|
1
|
+
import base64
|
|
1
2
|
import json
|
|
2
|
-
import time
|
|
3
|
-
import datetime
|
|
4
|
-
import openai
|
|
5
|
-
from langchain.tools import tool
|
|
6
|
-
from langchain.tools.render import format_tool_to_openai_function
|
|
7
|
-
from langchain.prompts import ChatPromptTemplate
|
|
8
|
-
from langchain.chat_models import ChatOpenAI
|
|
9
|
-
from pydantic import BaseModel, Field, validator
|
|
10
|
-
from langchain.utils.openai_functions import convert_pydantic_to_openai_function
|
|
11
|
-
from typing import Optional, Literal
|
|
12
|
-
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
|
|
13
|
-
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
|
|
14
|
-
import idvpackage.genai_utils as genai_utils
|
|
15
|
-
import idvpackage.genai_utils as sanity_utils
|
|
16
|
-
from datetime import datetime, timedelta
|
|
17
|
-
from dateutil.relativedelta import relativedelta
|
|
18
|
-
from pydantic import ValidationError
|
|
19
3
|
import logging
|
|
20
|
-
|
|
21
|
-
# import base64
|
|
22
|
-
# import time
|
|
23
|
-
# from io import BytesIO
|
|
24
|
-
# from typing import Set, List, Optional
|
|
25
|
-
# import json
|
|
26
|
-
# import cv2
|
|
27
|
-
# import torch
|
|
28
|
-
# from PIL import Image
|
|
29
|
-
# from openai import OpenAI
|
|
30
|
-
# from pydantic import BaseModel, Field, validator
|
|
31
|
-
# import logging
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
# logging.basicConfig(
|
|
36
|
-
# level=logging.INFO,
|
|
37
|
-
# format='%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s',
|
|
38
|
-
# datefmt='%Y-%m-%d %H:%M:%S',
|
|
39
|
-
# force=True
|
|
40
|
-
# )
|
|
41
|
-
|
|
42
|
-
class Verify_IRQ_Passport(BaseModel):
|
|
43
|
-
"""Validates whether a given OCR text represents a valid Iraqi Passport"""
|
|
44
|
-
is_valid_id: Literal["True", "False"] = Field(..., description="Return True if document is a valid Iraqi Passport"
|
|
45
|
-
"It should contain Arabic/Kurdish text like: جمهورية العراق, کۆماری عێراق and English Text: Republic of Iraq"
|
|
46
|
-
"Return False otherwise.")
|
|
47
|
-
side: Literal["passport", ""] = Field(..., description="Return passport if the document is a valid Iraqi Passport")
|
|
4
|
+
import time
|
|
48
5
|
|
|
49
|
-
class Iraq_Passport(BaseModel):
|
|
50
|
-
"""Extract the fields from the OCR extracted text of an Iraqi Passport"""
|
|
51
|
-
full_name: str = Field(..., description="Full name of the person on the passport")
|
|
52
|
-
last_name: str = Field(..., description="Surname of the person on the passport")
|
|
53
|
-
dob: str = Field(..., description="Date of Birth")
|
|
54
|
-
place_of_birth: str = Field(...,
|
|
55
|
-
description=(
|
|
56
|
-
"Place of Birth of the person on the passport"
|
|
57
|
-
"DO NOT mix it up with Issuing Authority"
|
|
58
|
-
"Translate to English"
|
|
59
|
-
)
|
|
60
|
-
)
|
|
61
|
-
mother_name: str = Field(..., description="Mother's full name")
|
|
62
|
-
gender_letter: str = Field(..., description="Gender/Sex of the person on the passport. It is either M or F.")
|
|
63
|
-
issuing_authority: str = Field(...,
|
|
64
|
-
description=(
|
|
65
|
-
"Issuing Authority"
|
|
66
|
-
"Translate to English"
|
|
67
|
-
)
|
|
68
|
-
)
|
|
69
|
-
nationality: str = Field(..., description="Nationality in ISO 3166-1 alpha-3 format (e.g., 'IRQ' for Iraqi, 'JOR' for Jordanian)", example="IRQ")
|
|
70
|
-
issuing_country: str = Field(..., description="Issuing Country/Country Code (e.g. 'IRQ', 'JOR')", example='IRQ')
|
|
71
|
-
id_number: str = Field(..., description="9-character alphanumeric passport number.")
|
|
72
|
-
mrz1: str = Field(...,
|
|
73
|
-
description=(
|
|
74
|
-
"MRZ Line 1."
|
|
75
|
-
"Should be exactly 44 characters long."
|
|
76
|
-
"If OCR splits it across lines, join them into one."
|
|
77
|
-
"Do not confuse with MRZ Line 2 — Line 1 typically starts with 'P<' and contains names."
|
|
78
|
-
)
|
|
79
|
-
)
|
|
80
|
-
|
|
81
|
-
mrz2: str = Field(...,
|
|
82
|
-
description=(
|
|
83
|
-
"MRZ Line 2."
|
|
84
|
-
"Should be exactly 44 characters long."
|
|
85
|
-
"If OCR splits it across lines, join them into one string."
|
|
86
|
-
"Do not confuse with MRZ Line 1 — Line 2 contains passport number, nationality, DOB, expiry, etc."
|
|
87
|
-
)
|
|
88
|
-
)
|
|
89
|
-
|
|
90
|
-
@validator("mrz2")
|
|
91
|
-
def validate_mrz2_content_length(cls, v):
|
|
92
|
-
if len(v.replace('<', '')) < 28:
|
|
93
|
-
raise ValueError("cropped_mrz")
|
|
94
|
-
return v
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
@tool(args_schema=Iraq_Passport)
|
|
98
|
-
def sanity_check_irq_passport(full_name='',
|
|
99
|
-
last_name='',
|
|
100
|
-
dob='',
|
|
101
|
-
place_of_birth='',
|
|
102
|
-
mother_name='',
|
|
103
|
-
gender_letter='',
|
|
104
|
-
issuing_authority='',
|
|
105
|
-
nationality='',
|
|
106
|
-
issuing_country='',
|
|
107
|
-
id_number='',
|
|
108
|
-
mrz='',
|
|
109
|
-
mrz1='',
|
|
110
|
-
mrz2=''):
|
|
111
|
-
try:
|
|
112
6
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
#
|
|
116
|
-
# if len(mrz2)<44:
|
|
117
|
-
# return {'error': 'covered_photo', 'error_details': 'cropped mrz'}
|
|
7
|
+
from openai import OpenAI
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
118
9
|
|
|
119
|
-
# if len(mrz2.replace('<',''))<30:
|
|
120
|
-
# return {'error': 'covered_photo', 'error_details': 'cropped mrz'}
|
|
121
10
|
|
|
122
11
|
|
|
123
|
-
|
|
12
|
+
PROMPT_PASSPORT_IRQ = """
|
|
13
|
+
You are given an image of an Iraqi passport. Your task is to read the passport and return ONLY a valid JSON object matching the IraqiPassport schema.
|
|
124
14
|
|
|
15
|
+
Follow all rules exactly. Do NOT guess or invent any value.
|
|
125
16
|
|
|
17
|
+
================================================
|
|
18
|
+
1. LANGUAGE RULES
|
|
19
|
+
================================================
|
|
126
20
|
|
|
127
|
-
|
|
21
|
+
- Read all visible text in English, Arabic, and Kurdish.
|
|
22
|
+
- Prefer English if it exists.
|
|
23
|
+
- If the **field label is in English but the VALUE is in Arabic/Kurdish**,
|
|
24
|
+
→ transliterate the VALUE into English letters.
|
|
25
|
+
- If a field appears ONLY in Arabic/Kurdish:
|
|
26
|
+
→ transliterate it to English letters (romanization).
|
|
27
|
+
- Do NOT translate or change meaning.
|
|
28
|
+
- If any value is unreadable or missing → return "".
|
|
128
29
|
|
|
129
|
-
|
|
130
|
-
|
|
30
|
+
================================================
|
|
31
|
+
2. FIELDS TO EXTRACT (IraqiPassport schema)
|
|
32
|
+
================================================
|
|
131
33
|
|
|
132
|
-
|
|
133
|
-
expiry_date = sanity_utils.parse_yymmdd(expiry_date) # string 'YYYY-MM-DD'
|
|
134
|
-
is_doc_expired = sanity_utils.is_expired_id(expiry_date)
|
|
34
|
+
You must fill ALL fields:
|
|
135
35
|
|
|
136
|
-
|
|
137
|
-
|
|
36
|
+
1) Names:
|
|
37
|
+
- `full_name`: English full name exactly as printed.
|
|
38
|
+
- `last_name`: English surname exactly as printed.
|
|
39
|
+
- `mother_name`: Mother's name in English.
|
|
40
|
+
- If any of these appear only in Arabic/Kurdish → transliterate the value.
|
|
138
41
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
42
|
+
2) Place of Birth:
|
|
43
|
+
- `place_of_birth`:
|
|
44
|
+
- If English value exists → return it exactly.
|
|
45
|
+
- If value is shown in Arabic/Kurdish (even beside English label)
|
|
46
|
+
→ transliterate only the VALUE into English letters.
|
|
47
|
+
|
|
48
|
+
3) Issuing Authority:
|
|
49
|
+
- `issuing_authority`:
|
|
50
|
+
- If English value exists → return exactly.
|
|
51
|
+
- If the value under the English label is Arabic/Kurdish
|
|
52
|
+
→ transliterate only the VALUE into English letters.
|
|
53
|
+
|
|
54
|
+
4) Issuing Country:
|
|
55
|
+
- `issuing_country`: Use English country name or code (e.g. IRQ).
|
|
56
|
+
- If only Arabic/Kurdish appears → transliterate and map correctly.
|
|
57
|
+
|
|
58
|
+
5) Gender:
|
|
59
|
+
- `gender_letter`: “M” or “F” only.
|
|
60
|
+
|
|
61
|
+
6) Dates:
|
|
62
|
+
- `dob`, `issue_date`, `expiry_date`:
|
|
63
|
+
- Convert to DD/MM/YYYY format.
|
|
64
|
+
- If unclear → return empty string.
|
|
65
|
+
|
|
66
|
+
7) Passport Number:
|
|
67
|
+
- `id_number`: Must be 1 uppercase letter + 8 digits.
|
|
68
|
+
- If not clearly readable → return "".
|
|
69
|
+
|
|
70
|
+
8) Nationality:
|
|
71
|
+
- `nationality`: Use 3-letter ISO format (e.g., IRQ).
|
|
72
|
+
- If only Arabic/Kurdish appears (e.g. “عراقي”) → transliterate then map.
|
|
73
|
+
|
|
74
|
+
9) Fallback rule:
|
|
75
|
+
- If ONLY two locations appear (in Arabic/Kurdish) without labels:
|
|
76
|
+
- 1st → place_of_birth
|
|
77
|
+
- 2nd → issuing_authority
|
|
78
|
+
|
|
79
|
+
10) Header:
|
|
80
|
+
- `header_verified`: Return True if the document is a valid Iraqi Passport.
|
|
81
|
+
It should contain Arabic/Kurdish text like: جمهورية العراق, کۆماری عێراق and English Text: Republic of Iraq.
|
|
82
|
+
Return False otherwise.
|
|
83
|
+
================================================
|
|
84
|
+
3. MRZ RULES
|
|
85
|
+
================================================
|
|
86
|
+
|
|
87
|
+
- `mrz1` and `mrz2` must be EXACTLY 44 characters.
|
|
88
|
+
- Allowed: A–Z, 0–9, `<`.
|
|
89
|
+
- No spaces or punctuation.
|
|
90
|
+
- If a line is shorter → pad with `<` at the END.
|
|
91
|
+
- In `mrz2`: final char is check digit; pad BEFORE it.
|
|
92
|
+
- Do not add, remove, or modify characters except padding.
|
|
93
|
+
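- Inside the MRZ lines, date of birth and expiry date are encoded as YYMMDD with no separators; copy them as printed. The separate `dob_mrz` and `expiry_date_mrz` fields must be returned in DD/MM/YYYY format.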
|
|
94
|
+
- passport number in MRZ must be exactly 9 characters (1 letter + 8 digits), padded with `<` if necessary.
|
|
95
|
+
- gender_mrz: extract gender from MRZ line 2 if M return `MALE`, if F return `FEMALE`
|
|
96
|
+
- expiry_date_mrz: extract expiry date from MRZ line 2 in DD/MM/YYYY format
|
|
97
|
+
================================================
|
|
98
|
+
4. NO GUESSING & FINAL OUTPUT
|
|
99
|
+
================================================
|
|
100
|
+
|
|
101
|
+
- If any field is missing or unreadable → return an empty string.
|
|
102
|
+
- Do NOT infer values.
|
|
103
|
+
- Output ONLY a single JSON object matching the IraqiPassport schema.
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
"""
|
|
107
|
+
class IraqiPassport(BaseModel):
|
|
108
|
+
# ocr_text: str = Field(..., description="Full OCR extracted text from the Iraqi Passport image.")
|
|
109
|
+
full_name: str = Field(..., description="The Full Name, in English, exactly as printed on the document")
|
|
110
|
+
last_name: str = Field(..., description="Surname of the person on the passport")
|
|
111
|
+
place_of_birth: str = Field(..., description=("If Place of Birth is in English, return exactly as printed."
|
|
112
|
+
"If not present in English, look at the right-hand side of the passport, where it says 'Place of Birth'."
|
|
113
|
+
"Transliterate to English if value of Place of Birth is only in Arabic"))
|
|
114
|
+
|
|
115
|
+
issuing_authority: str = Field(..., description=("Place of passport issuing authority in English"
|
|
116
|
+
"Transliterate to English if issuing authority is only in Arabic"))
|
|
117
|
+
issuing_country: str = Field(..., description="Issuing Country/Country Code (e.g. 'IRQ', 'JOR')", example='IRQ')
|
|
118
|
+
mother_name: str = Field(..., description="Mother's full name in English, exactly as printed.")
|
|
119
|
+
gender_letter: str = Field(..., description="Sex: M or F")
|
|
120
|
+
mrz1: str = Field(..., min_length=44, max_length=44,
|
|
121
|
+
description="First line of the MRZ, exactly 44 characters, padded with '<' at the end if shorter")
|
|
122
|
+
mrz2: str = Field(..., min_length=44, max_length=44,
|
|
123
|
+
description="Second line of the MRZ, exactly 44 characters. Padding with '<' must be inserted before the final check digit.")
|
|
124
|
+
id_number: str = Field(..., pattern=r"^[A-Z][0-9]{8}$",
|
|
125
|
+
description="Passport number: one uppercase letter followed by 8 digits")
|
|
126
|
+
|
|
127
|
+
dob: str = Field(
|
|
128
|
+
..., description="Date of birth in DD/MM/YYYY format"
|
|
129
|
+
)
|
|
130
|
+
issue_date: str = Field(
|
|
131
|
+
..., description="Issue date in DD/MM/YYYY format"
|
|
132
|
+
)
|
|
133
|
+
expiry_date: str = Field(
|
|
134
|
+
..., description="Expiry date in DD/MM/YYYY format"
|
|
135
|
+
)
|
|
136
|
+
nationality: str = Field(
|
|
137
|
+
..., description="Nationality in ISO 3166-1 alpha-3 format (e.g., SDN)"
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
header_verified: bool = Field(
|
|
141
|
+
..., description="Return True if document is a valid Iraqi Passport"
|
|
142
|
+
"It should contain Arabic/Kurdish text like: جمهورية العراق, کۆماری عێراق and English Text: Republic of Iraq"
|
|
143
|
+
"Return False otherwise."
|
|
144
|
+
)
|
|
145
|
+
dob_mrz: str = Field(
|
|
146
|
+
..., description="Date of birth as extracted from MRZ (in DD/MM/YYYY format)"
|
|
147
|
+
)
|
|
148
|
+
id_number_mrz: str = Field(
|
|
149
|
+
..., min_length=9, max_length=9, description="ID number as extracted from MRZ"
|
|
150
|
+
)
|
|
151
|
+
expiry_date_mrz: str = Field(
|
|
152
|
+
..., description="Expiry date as extracted from MRZ (in DD/MM/YYYY format)"
|
|
153
|
+
)
|
|
154
|
+
gender_mrz: str = Field(
|
|
155
|
+
..., description="Gender as extracted from MRZ (M or F) if M return MALE else if F return FEMALE"
|
|
156
|
+
)
|
|
201
157
|
|
|
158
|
+
PROMPT_FRONT_IRQ = """
|
|
159
|
+
You will receive an image of the **front side** of an Iraqi National ID Card.
|
|
160
|
+
Follow ALL steps strictly. Do NOT skip, modify, assume, or guess any information.
|
|
161
|
+
|
|
162
|
+
1.OCR (MANDATORY)
|
|
163
|
+
Perform full OCR on the entire image.
|
|
164
|
+
Extract ALL visible Arabic, Kurdish, numbers, and symbols EXACTLY as printed.
|
|
165
|
+
Keep line breaks, spacing, spelling, diacritics, and punctuation.
|
|
166
|
+
Do NOT correct, normalize, or reorder text.
|
|
167
|
+
Return the full OCR text in: raw_ocr_text.
|
|
168
|
+
|
|
169
|
+
2.PERSONAL DETAILS EXTRACTION AND ALL THE INFORMATION REQUESTED IS PRESENT ON THE FRONT SIDE OF THE ID CARD.
|
|
170
|
+
From the OCR text, extract the following fields EXACTLY as printed:
|
|
171
|
+
Extract these fields EXACTLY as printed.
|
|
172
|
+
The ID card contains only Arabic/Kurdish text, so the *_en fields must be generated by translating the Arabic/Kurdish name into its English form.
|
|
173
|
+
This translation is REQUIRED and is NOT considered guessing.
|
|
174
|
+
Only leave *_en fields empty if the Arabic field itself is empty.
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
- first_name: Arabic first name
|
|
178
|
+
- first_name_en: translate the first name into English
|
|
179
|
+
- father_name: Arabic father name
|
|
180
|
+
- father_name_en: translate the father name into English
|
|
181
|
+
- third_name: Arabic paternal grandfather name
|
|
182
|
+
- third_name_en: translate the paternal grandfather name into English
|
|
183
|
+
- last_name: Arabic family/tribal name (empty string if missing)
|
|
184
|
+
- last_name_en: translate the family/tribal name into English (empty string if missing)
|
|
185
|
+
- mother_first_name: Arabic mother first name
|
|
186
|
+
- mother_first_name_en: translate the mother first name into English
|
|
187
|
+
- mother_last_name: Arabic maternal grandfather/mother’s last name
|
|
188
|
+
- mother_last_name_en: translate the maternal grandfather/mother’s last name into English
|
|
189
|
+
|
|
190
|
+
3.Gender:
|
|
191
|
+
You MUST read gender ONLY from the Arabic field labeled "الجنس" or "جنس".
|
|
192
|
+
Do NOT infer gender from the name or photo.
|
|
193
|
+
|
|
194
|
+
- gender_ar: ذكر or
|
|
195
|
+
أنثى
|
|
196
|
+
- gender: Male or Female
|
|
197
|
+
|
|
198
|
+
4.DOCUMENT DETAILS EXTRACTION
|
|
199
|
+
Extract all document identifiers exactly as printed:
|
|
200
|
+
- id_number: 12-digit National ID Number
|
|
201
|
+
- card_number: 9-character card/document number
|
|
202
|
+
- serial_number: 6-character serial number printed vertically on the card (empty if missing)
|
|
203
|
+
- blood_type: e.g., O+, A-, AB+ (empty if missing)
|
|
204
|
+
|
|
205
|
+
5.HEADER VERIFICATION
|
|
206
|
+
Check if the header is present:
|
|
207
|
+
- Must contain at least one of the following Arabic/Kurdish issuing authority texts:
|
|
208
|
+
"جمهورية العراق"
|
|
209
|
+
"وزارة الداخلية"
|
|
210
|
+
"مديرية الأحوال المدنية والجوازات والاقامة"
|
|
211
|
+
"کوماری عیراق"
|
|
212
|
+
"وه زاره تی ناوخو"
|
|
213
|
+
"پریود به را بائی باری شارستانی و پاسپورت و نیشنگه"
|
|
214
|
+
- Set `header_verified = true` only if at least one appears clearly.
|
|
215
|
+
- Otherwise set `header_verified = false`.
|
|
216
|
+
|
|
217
|
+
6.VALIDATION RULES
|
|
218
|
+
|
|
219
|
+
- Do NOT guess, assume, or hallucinate.
|
|
220
|
+
- If a field is missing, unclear, faint, or unreadable → return empty string.
|
|
221
|
+
- Output MUST match the schema exactly.
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
7.OUTPUT FORMAT
|
|
225
|
+
|
|
226
|
+
Return ONLY structured JSON.
|
|
227
|
+
No explanations. No commentary.
|
|
228
|
+
"""
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
PROMPT_BACK_IRQ = """
|
|
233
|
+
You will be provided with an image showing the **back side of an Iraqi National ID Card**.
|
|
234
|
+
Follow every instruction with absolute precision. Do NOT infer, assume, or guess any information.
|
|
235
|
+
|
|
236
|
+
Extract EVERY visible Arabic, Kurdish, English, number, and symbol EXACTLY as printed.
|
|
237
|
+
You MUST capture even faint, blurred, low-contrast, small, or partially visible text.
|
|
238
|
+
|
|
239
|
+
1. OCR Requirements:
|
|
240
|
+
- Preserve original line breaks and spacing
|
|
241
|
+
- Preserve punctuation, diacritics, and character shapes
|
|
242
|
+
- Preserve text even if rotated, tilted, or near the edges
|
|
243
|
+
- Do NOT correct spelling or normalize text
|
|
244
|
+
- Do NOT merge or reorder lines
|
|
245
|
+
- Do NOT skip faint or partial text
|
|
246
|
+
|
|
247
|
+
Return the full raw OCR output exactly as extracted in: raw_ocr_text.
|
|
248
|
+
|
|
249
|
+
2. MRZ (Machine Readable Zone)
|
|
250
|
+
- mrz1: first MRZ line (exactly as printed, 30 characters, keep '<', remove spaces around it)
|
|
251
|
+
- mrz2: second MRZ line (same rule, 30 characters)
|
|
252
|
+
- mrz3: third MRZ line (same rule, 30 characters)
|
|
253
|
+
If any MRZ line is missing, set it to "".
|
|
254
|
+
|
|
255
|
+
3.HEADER VERIFICATION
|
|
256
|
+
- header_verified: true if "IDIRQ" appears anywhere in mrz1, otherwise false.
|
|
257
|
+
|
|
258
|
+
4. DATE FIELDS (DD/MM/YYYY)
|
|
259
|
+
From raw_ocr_text, extract:
|
|
260
|
+
- dob: date of birth
|
|
261
|
+
- issue_date
|
|
262
|
+
- expiry_date
|
|
263
|
+
Dates must be in DD/MM/YYYY format.
|
|
264
|
+
If a date is missing or unclear, return "" for that field.
|
|
265
|
+
|
|
266
|
+
5.ISSUING AUTHORITY
|
|
267
|
+
- issuing_authority_ar: Arabic text of the issuing authority exactly as printed
|
|
268
|
+
(often like "مديرية الأحوال المدنية - [City]")
|
|
269
|
+
- issuing_authority_en: English translation of the issuing authority
|
|
270
|
+
|
|
271
|
+
6. PLACE OF BIRTH
|
|
272
|
+
|
|
273
|
+
- place_of_birth: Arabic text
|
|
274
|
+
- place_of_birth_en: translation
|
|
275
|
+
|
|
276
|
+
7. NAMES FROM MRZ LINE 3
|
|
277
|
+
From mrz3 (if present):
|
|
278
|
+
- last_name_back: the text before '<<'
|
|
279
|
+
- first_name_back: the text after '<<'
|
|
280
|
+
If mrz3 is empty, both fields must be "".
|
|
281
|
+
|
|
282
|
+
8.FAMILY NUMBER
|
|
283
|
+
- family_number: 18-character alphanumeric value if present in raw_ocr_text, else "".
|
|
284
|
+
- family_number_en: same as family_number.
|
|
285
|
+
|
|
286
|
+
9. NATIONALITY
|
|
287
|
+
- nationality: 3-letter code (e.g., "IRQ") if present, else "".
|
|
288
|
+
10.GENDER FROM MRZ LINE 2
|
|
289
|
+
- gender_mrz: 'M' or 'F' extracted from mrz2. If mrz2 is missing, return "".
|
|
290
|
+
11.dob_mrz: date of birth taken from the first six characters (YYMMDD) of mrz2, returned in DD/MM/YYYY format. If mrz2 is missing, return "".
|
|
291
|
+
12.expiry_date_mrz: expiry date from mrz2 in DD/MM/YYYY format. If mrz2 is missing, return "".
|
|
292
|
+
13.card_number_mrz: document number from mrz1. If mrz1 is missing, return "".
|
|
293
|
+
|
|
294
|
+
RULES:
|
|
295
|
+
- Never guess or infer. If something is not clearly present in raw_ocr_text, return "".
|
|
296
|
+
- Return ONLY a JSON object matching the schema exactly.
|
|
297
|
+
|
|
298
|
+
"""
|
|
202
299
|
class Iraq_National_ID_front(BaseModel):
|
|
203
|
-
"""Extract the fields from the OCR extracted text of an Iraqi National ID's front side. Front Side has names, (like father name, mother name etc.), national id numbers but has no dates.
|
|
300
|
+
"""Extract the fields from the OCR extracted text of an Iraqi National ID's front side. Front Side has names, (like father name, mother name etc.), national id numbers but has no dates.
|
|
301
|
+
Translate wherever required."""
|
|
302
|
+
ocr_text: str = Field(..., description="Full OCR extracted text from the Iraqi National ID front side image.")
|
|
204
303
|
first_name: str = Field(..., description="First name (الاسم / ناو) in Arabic.")
|
|
205
304
|
first_name_en: str = Field(..., description="First name (الاسم / ناو), translated to English.")
|
|
206
305
|
father_name: str = Field(..., description="Father's name (الأب / باوك) in Arabic.")
|
|
207
306
|
father_name_en: str = Field(..., description="Father's name (الأب / باوك), translated to English.")
|
|
208
307
|
third_name: str = Field(..., description="Paternal grandfather's name (الجد / بابير) in Arabic.")
|
|
209
308
|
third_name_en: str = Field(..., description="Paternal grandfather's name (الجد / بابير), translated to English.")
|
|
210
|
-
last_name:
|
|
309
|
+
last_name: str = Field(
|
|
211
310
|
"",
|
|
212
311
|
description=(
|
|
213
312
|
"Family/tribal name (اللقب / نازناو) in Arabic. "
|
|
214
313
|
"OCR extracts various versions of 'نازناو' like الزناو, الزنار; do not interpret them as the family name."
|
|
215
314
|
)
|
|
216
315
|
)
|
|
217
|
-
last_name_en:
|
|
316
|
+
last_name_en: str = Field(
|
|
218
317
|
"",
|
|
219
318
|
description=(
|
|
220
319
|
"Family/tribal name (اللقب / نازناو), translated to English. "
|
|
@@ -225,94 +324,25 @@ class Iraq_National_ID_front(BaseModel):
|
|
|
225
324
|
mother_first_name_en: str = Field(..., description="Mother's name (الام/ دابك), translated to English.")
|
|
226
325
|
mother_last_name: str = Field(..., description="Maternal grandfather's name (الجد / بابير) in Arabic.")
|
|
227
326
|
mother_last_name_en: str = Field(..., description="Maternal grandfather's name (الجد / بابير), translated to English.")
|
|
228
|
-
gender_ar: str = Field(..., description="Gender (الجنس / ردگار): ذكر (
|
|
229
|
-
gender: str = Field(..., description="Gender (الجنس / ردگار), translated to English")
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
serial_number:
|
|
233
|
-
blood_type:
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
third_name_en='',
|
|
245
|
-
last_name='',
|
|
246
|
-
last_name_en='',
|
|
247
|
-
mother_first_name='',
|
|
248
|
-
mother_first_name_en='',
|
|
249
|
-
mother_last_name='',
|
|
250
|
-
mother_last_name_en='',
|
|
251
|
-
gender_ar='',
|
|
252
|
-
gender='',
|
|
253
|
-
blood_type='',
|
|
254
|
-
serial_number=''
|
|
255
|
-
|
|
256
|
-
) -> dict:
|
|
257
|
-
print("SANITY CHECK IRQ FRONT WAS CALLED")
|
|
258
|
-
"""Run sanity checks on the data extracted from Iraq national ID's front side."""
|
|
259
|
-
#Post-Processing steps
|
|
260
|
-
try:
|
|
261
|
-
if not id_number_front.isdigit() or len(id_number_front) != 12:
|
|
262
|
-
return {'error': 'invalid_national_number', 'error_details': 'invalid national number, please take a clearer picture of your image. Note: We do not accept Civil Status IDs.'}
|
|
263
|
-
|
|
264
|
-
if len(card_number_front) != 9:
|
|
265
|
-
return {'error': 'invalid_document_number', 'error_details': 'invalid document number, please take a clearer picture of your image. Note: We do not accept Civil Status IDs.'}
|
|
266
|
-
|
|
267
|
-
doc_type = 'national_identity_card'
|
|
268
|
-
#at this point, verify_irq_id has run, so we can safely assume the nationality here is IRQ
|
|
269
|
-
nationality='IRQ'
|
|
270
|
-
nationality_en = 'IRQ'
|
|
271
|
-
|
|
272
|
-
optional_fields = ('last_name', 'last_name_en','serial_number','blood_type')
|
|
273
|
-
required_fields = {k: v for k, v in locals().items() if k not in optional_fields}
|
|
274
|
-
|
|
275
|
-
result_dict = {**locals()}
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
if not last_name or not last_name_en:
|
|
283
|
-
name = result_dict.get('first_name', '') + " " + result_dict.get('father_name', '')
|
|
284
|
-
name_en = result_dict.get('first_name_en', '') + " " + result_dict.get('father_name_en', '')
|
|
285
|
-
else:
|
|
286
|
-
name = result_dict.get('first_name', '') + " " + result_dict.get('father_name', '') + " " + result_dict.get('last_name','')
|
|
287
|
-
name_en = result_dict.get('first_name_en', '') + " " + result_dict.get('father_name_en', '')+ " " + result_dict.get("last_name_en",'')
|
|
288
|
-
|
|
289
|
-
missing = [key for key, value in required_fields.items() if not str(value).strip()]
|
|
290
|
-
if missing:
|
|
291
|
-
return {'error': 'covered_photo', 'error_details': f'Missing or empty fields: {", ".join(missing)}'}
|
|
292
|
-
|
|
293
|
-
result = {
|
|
294
|
-
"error": "",
|
|
295
|
-
"error_details": "",
|
|
296
|
-
**locals()
|
|
297
|
-
}
|
|
298
|
-
|
|
299
|
-
if 'required_fields' in result.keys():
|
|
300
|
-
del result['required_fields']
|
|
301
|
-
if 'missing' in result.keys():
|
|
302
|
-
del result['missing']
|
|
303
|
-
if 'optional_fields' in result.keys():
|
|
304
|
-
del result['optional_fields']
|
|
305
|
-
if 'result_dict' in result.keys():
|
|
306
|
-
del result['result_dict']
|
|
307
|
-
return result
|
|
308
|
-
|
|
309
|
-
except Exception as e:
|
|
310
|
-
return {'error':'covered_photo','error_details':e}
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
327
|
+
gender_ar: str = Field(..., description="Gender (الجنس / ردگار): ذكر (Male) or أنثى (Female).")
|
|
328
|
+
gender: str = Field(..., description="Gender (الجنس / ردگار), translated to English male or female ")
|
|
329
|
+
id_number: str = Field(...,min_length=12, max_length=12, description="12-digit national ID number.Must be exactly 12 digits.")
|
|
330
|
+
card_number: str = Field(...,min_length=9, max_length=9, description="9-character alphanumeric document number.Must be exactly 9 characters.")
|
|
331
|
+
serial_number: str = Field("", min_length = 6, max_length=6, description="6-digit card serial number present vertical, its the last thing present in ocr text extracted.")
|
|
332
|
+
blood_type: str = Field(None, description="Blood type (e.g., O+, A-).")
|
|
333
|
+
header_verified: bool = Field(..., description="whether document is a valid Iraqi National ID's front side."
|
|
334
|
+
"It should strictly contain at least one of the following Arabic/Kurdish texts:"
|
|
335
|
+
" جمهورية العراق / وزارة الداخلية"
|
|
336
|
+
"مديرية الأحوال المدنية والجوازات والاقامة"
|
|
337
|
+
"کوماری عیراق / وه زاره تی ناوخو"
|
|
338
|
+
"پریود به را بائی باری شارستانی و پاسپورت و نیشنگه"
|
|
339
|
+
"جمهورية العراق / وزارة الداخلية"
|
|
340
|
+
"کوماری عیراق / وه زاره تی ناوخو")
|
|
341
|
+
|
|
342
|
+
|
|
314
343
|
class Iraq_National_ID_back(BaseModel):
|
|
315
344
|
"""Extract only the Arabic fields from the OCR text of an Iraqi National ID's back side. A back side has fields like dates: issue, expiry, birth. Translate where required."""
|
|
345
|
+
ocr_text: str = Field(..., description="Full OCR extracted text from the Iraqi National ID back side image.")
|
|
316
346
|
issuing_authority: str = Field(..., description="Issuing authority (جهة الاصدار / لايانى ددرجوون) in Arabic")
|
|
317
347
|
issuing_authority_en: str = Field(..., description="Issuing authority (جهة الاصدار / لايانى ددرجوون), translated to English")
|
|
318
348
|
issue_date: str = Field(..., description="Date of issue")
|
|
@@ -320,644 +350,95 @@ class Iraq_National_ID_back(BaseModel):
|
|
|
320
350
|
place_of_birth: str = Field(..., description="Place of birth in Arabic.")
|
|
321
351
|
place_of_birth_en: str = Field(..., description="Place of birth, translated to English.")
|
|
322
352
|
dob: str = Field(..., description="Date of birth")
|
|
323
|
-
family_number: str = Field(..., description='18-character alphanumeric Family number (الرقم العائلي / ژمارەى خێزانی)')
|
|
353
|
+
family_number: str = Field(..., min_length=18, max_length=18, description='18-character alphanumeric Family number (الرقم العائلي / ژمارەى خێزانی)')
|
|
354
|
+
family_number_en: str = Field(..., min_length=18, max_length=18, description='18-character alphanumeric Family number same as family number (الرقم العائلي / ژمارەى خێزانی)')
|
|
324
355
|
mrz1: str = Field(...,description="MRZ Line 1: Includes document type (ID), issuing country code (IRQ), document number, and check digits. Example: 'IDIRQAL36266736200026108063<<<'")
|
|
325
356
|
mrz2: str = Field(...,description="MRZ Line 2: Encodes date of birth (YYMMDD), gender (M/F), expiry date (YYMMDD), and nationality code (IRQ) and check digit at the end of '<<<<<<'. Example: '0007191M2811280IRQ<<<<<<<<<<<7'")
|
|
326
357
|
mrz3: str = Field(...,description="MRZ Line 3: Contains surname and given name(s), separated by '<<'. Given names may include multiple parts separated by '<'. If no surname is present, it starts with '<<'. Example: 'AHMED<<ALI<HASSAN' or '<<ALI'")
|
|
358
|
+
gender_mrz: str = Field(...,description="Gender extracted from MRZ line 2: 'M' for Male, 'F' for female.")
|
|
359
|
+
expiry_date_mrz: str = Field(...,description="Expiry date extracted from MRZ line 2 in DD/MM/YYYY format.")
|
|
360
|
+
dob_mrz: str = Field(...,description="Date of birth as extracted from MRZ (in DD/MM/YYYY format) first six characters of mrz2")
|
|
327
361
|
last_name_back: str = Field(...,description="Surname extracted from MRZ line 3, before the '<<' separator.")
|
|
328
362
|
first_name_back: str = Field(...,description="Given name extracted from MRZ line 3, after the '<<' seperator.")
|
|
363
|
+
header_verified: bool = Field(..., description="if header contains in mrz1 'IDIRQ' then true else false")
|
|
364
|
+
card_number_mrz: str = Field(..., min_length = 9, max_length=9, description="Document number as extracted from MRZ line 1")
|
|
329
365
|
|
|
330
366
|
|
|
331
|
-
@tool(args_schema=Iraq_National_ID_back)
|
|
332
|
-
def sanity_check_irq_back(
|
|
333
|
-
issuing_authority='',
|
|
334
|
-
issuing_authority_en='',
|
|
335
|
-
issue_date='',
|
|
336
|
-
expiry_date='',
|
|
337
|
-
place_of_birth='',
|
|
338
|
-
place_of_birth_en='',
|
|
339
|
-
dob='', mrz1='', mrz2='', mrz3='',
|
|
340
|
-
last_name_back='',
|
|
341
|
-
first_name_back='',
|
|
342
|
-
family_number=''
|
|
343
|
-
):
|
|
344
|
-
try:
|
|
345
|
-
#===========Post-Processing==============
|
|
346
|
-
print("SANITY CHECK IRQ BACK WAS CALLED")
|
|
347
|
-
"""Run sanity checks on the data extracted from Iraq national ID's back side."""
|
|
348
|
-
doc_type = 'national_identity_card'
|
|
349
|
-
|
|
350
|
-
family_number = sanity_utils.fix_family_number(family_number)
|
|
351
|
-
|
|
352
|
-
family_number_en = family_number
|
|
353
|
-
|
|
354
|
-
#At this point, verify_irq_id has been run, so we can safely say its an Iraqi ID.
|
|
355
|
-
nationality='IRQ'
|
|
356
|
-
issuing_country='IRQ'
|
|
357
|
-
|
|
358
|
-
if mrz1:
|
|
359
|
-
card_number = mrz1.strip()[5:14]
|
|
360
|
-
card_number_back = mrz1.strip()[5:15]
|
|
361
|
-
id_number = mrz1.strip()[15:27]
|
|
362
|
-
mrz = [mrz1 + mrz2 + mrz3]
|
|
363
|
-
|
|
364
|
-
else:
|
|
365
|
-
return {'error':'covered_photo', 'error_details':'cropped_mrz'}
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
#==============Sanity checks for blur detection and/or cropped image
|
|
370
|
-
valid_expiry_issue = sanity_utils.is_expiry_issue_diff_valid(issue_date,expiry_date, 10)
|
|
371
|
-
age_check = sanity_utils.is_age_18_above(dob)
|
|
372
|
-
dob_match_mrz_dob = sanity_utils.is_mrz_dob_mrz_field_match(dob, mrz2)
|
|
373
367
|
|
|
374
|
-
is_doc_expired = sanity_utils.is_expired_id(expiry_date)
|
|
375
368
|
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
gender_back = sanity_utils.find_gender_from_back(mrz2.strip())
|
|
381
|
-
else:
|
|
382
|
-
gender_back=''
|
|
369
|
+
def process_image_irq(side):
    """Select the extraction schema and prompt for one Iraqi document side.

    Args:
        side: ``"front"`` or ``"back"`` for the national ID card, or
            ``"page1"`` for the passport.

    Returns:
        A ``(model, prompt)`` tuple: the schema class to request as the
        structured output format, and the prompt text that goes with it.

    Raises:
        ValueError: If ``side`` is not one of the supported values.
    """
    if side == "front":
        return Iraq_National_ID_front, PROMPT_FRONT_IRQ
    if side == "back":
        return Iraq_National_ID_back, PROMPT_BACK_IRQ
    if side == "page1":
        return IraqiPassport, PROMPT_PASSPORT_IRQ

    # The passport is selected with "page1" (not "passport"), so the message
    # must name the key that is actually accepted above.
    raise ValueError("Invalid document side specified. Use 'front', 'back', or 'page1'.")
|
|
388
385
|
|
|
389
386
|
|
|
390
|
-
#Check required fields
|
|
391
|
-
optional_fields = ('last_name_back','first_name_back')
|
|
392
|
-
required_fields = {k: v for k, v in locals().items() if k not in optional_fields}
|
|
393
387
|
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
'error_details': f'Missing or empty fields: {", ".join(missing)}'
|
|
399
|
-
}
|
|
388
|
+
def get_openai_response_irq(prompt: str, model_type, image: str, genai_key,
                            *, max_attempts: int = 3, retry_delay: float = 2):
    """Ask the OpenAI Responses API to extract structured data from an image.

    Args:
        prompt: Extraction instructions sent as the user text.
        model_type: Schema class passed as ``text_format`` so the reply is
            parsed into an instance of it.
        image: Base64-encoded image data; it is embedded verbatim in a
            ``data:image/jpeg;base64,`` URL, so it must already be a string.
        genai_key: OpenAI API key.
        max_attempts: How many times to try the request before giving up.
        retry_delay: Seconds to wait between two consecutive attempts.

    Returns:
        ``response.output_parsed`` from the first successful call, or
        ``None`` when every attempt raised.
    """
    # The request payload does not change between attempts; build it once.
    messages = [
        {
            "role": "system",
            "content": "You are an expert at extracting information from identity documents.",
        },
        {
            "role": "user",
            "content": [
                {"type": "input_text", "text": prompt},
                {
                    "type": "input_image",
                    "image_url": f"data:image/jpeg;base64,{image}",
                    "detail": "high",
                },
            ],
        },
    ]

    for attempt in range(max_attempts):
        try:
            # Client construction stays inside the try block so that a bad or
            # missing key is reported through the same "return None" contract.
            client = OpenAI(api_key=genai_key)
            response = client.responses.parse(
                model="gpt-4.1-mini",
                input=messages,
                text_format=model_type,
            )
            return response.output_parsed
        except Exception as e:
            # Best-effort retry: any failure is logged and the call repeated.
            logging.warning(f"[ERROR] Attempt {attempt + 1} failed: {str(e)}")
            # Only wait when another attempt follows; sleeping after the last
            # failure would just delay returning None to the caller.
            if attempt + 1 < max_attempts:
                time.sleep(retry_delay)
    return None
|
|
425
420
|
|
|
426
421
|
|
|
427
422
|
|
|
428
|
-
def
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
return tools[result.tool].run(result.tool_input)
|
|
438
|
-
|
|
439
|
-
def route_verification(result):
|
|
440
|
-
if isinstance(result,AgentFinish):
|
|
441
|
-
return ''
|
|
442
|
-
else:
|
|
443
|
-
return result.tool_input
|
|
444
|
-
|
|
445
|
-
def extraction_chain(ocr_text, openai_key, side = ''):
|
|
423
|
+
def get_response_from_openai_irq(image, side, openai_key):
    """Extract the fields of one Iraqi document side and return them as a dict.

    Args:
        image: Raw image bytes; they are base64-encoded before being sent.
        side: Document side key understood by ``process_image_irq``.
        openai_key: OpenAI API key forwarded to ``get_openai_response_irq``.

    Returns:
        The parsed model converted to a ``dict``. Returns ``{}`` when the
        extraction produced no result, and ``{"error": <message>}`` when the
        side is invalid or the extraction call raised.
    """
    logging.info(f"Processing Iraqi document side: {side}")

    # Validate the side first so an unsupported value is rejected before any
    # work is spent encoding the image.
    try:
        model, prompt = process_image_irq(side)
    except ValueError as ve:
        logging.error(f"Error: {ve}")
        return {"error": str(ve)}
    logging.info(f"Using prompt for side and prompt {side} selected.{prompt[:50]}...")

    image_bytes = base64.b64encode(image).decode('utf-8')

    try:
        start_time = time.perf_counter()  # monotonic clock for durations
        response = get_openai_response_irq(prompt, model, image_bytes, openai_key)
        elapsed_time = time.perf_counter() - start_time
        logging.info(f"OpenAI extraction took {elapsed_time:.2f} seconds")
    except Exception as e:
        logging.error(f"Error during OCR extraction: {e}")
        return {"error": "OCR extraction failed."}

    if response:
        # Prefer Pydantic v2's model_dump(); fall back to the v1-style dict().
        dump = getattr(response, "model_dump", None) or response.dict
        response_data = dump()
    else:
        # Keep the historical empty-dict result, but leave a trace in the logs.
        logging.error(f"OpenAI extraction returned no result for side: {side}")
        response_data = {}

    # NOTE(review): this writes the extracted identity data to the log at
    # INFO level; confirm that is acceptable for the deployment.
    # default=str keeps this diagnostic log from raising on a value that is
    # not natively JSON-serializable.
    logging.info(f"Openai response: {json.dumps(response_data, ensure_ascii=False, indent=2, default=str)}")
    return response_data
|
|
588
444
|
|
|
589
|
-
# PROMPT_FRONT_IRQ = """
|
|
590
|
-
# You are an expert in reading Iraqi National ID Cards. Extract the following fields from the **front side** of the ID image.
|
|
591
|
-
# OUTPUT FORMAT
|
|
592
|
-
# - Return a single JSON object and nothing else.
|
|
593
|
-
# - Use exactly these keys (string values) in this exact set: first_name, first_name_en, father_name, father_name_en, third_name, third_name_en, last_name, last_name_en, mother_first_name, mother_first_name_en, mother_last_name, mother_last_name_en, gender_ar, gender, id_number, card_number, serial_number, blood_type.
|
|
594
|
-
# - For any field you cannot read or that is not present, return an empty string "".
|
|
595
|
-
# - Do NOT include extra keys, comments, or explanatory text.
|
|
596
|
-
|
|
597
|
-
# PREFERRED EXTRACTION ORDER (must follow this order when resolving ambiguous or multiple name-like values)
|
|
598
|
-
# 1. name: first_name, first_name_en
|
|
599
|
-
# 2. father's name: father_name, father_name_en
|
|
600
|
-
# 3. paternal grandfather name / third name: third_name, third_name_en
|
|
601
|
-
# 4. family/tribal name / last name: last_name, last_name_en
|
|
602
|
-
# 5. mother's name (given / "bidah"): mother_first_name, mother_first_name_en
|
|
603
|
-
# 6. mother's last name: mother_last_name, mother_last_name_en
|
|
604
|
-
# 7. gender: gender_ar then gender
|
|
605
|
-
# 8. blood type: blood_type
|
|
606
|
-
|
|
607
|
-
# FIELD EXTRACTION RULES (high-precision)
|
|
608
|
-
# 1. General:
|
|
609
|
-
# - Prefer the text that is printed directly under or next to the label on the FRONT side. If multiple languages appear, store the Arabic exact text in *_ar fields and the English/transliterated text in *_en fields.
|
|
610
|
-
# - When multiple candidate name-like values exist, choose following the PREFERRED EXTRACTION ORDER above. Do NOT swap order or assign the paternal-grandfather value to the father's slot, etc.
|
|
611
|
-
# - Preserve characters exactly as printed for Arabic fields; do not normalize or transliterate Arabic into Latin unless placed into a *_en field.
|
|
612
|
-
# - Do NOT hallucinate, infer, or guess missing values. If unclear, return "".
|
|
613
|
-
|
|
614
|
-
# 2. Names:
|
|
615
|
-
# - first_name (Arabic): the given name exactly as printed in Arabic on the front.
|
|
616
|
-
# - first_name_en: the same given name transliterated to English (Latin script) exactly as printed or transliterated from Arabic; preserve casing and spaces.
|
|
617
|
-
# - father_name / father_name_en, third_name / third_name_en follow same rules for father and paternal-grandfather.
|
|
618
|
-
# - last_name / last_name_en: family/tribal name if present. If not present, return "" for both.
|
|
619
|
-
# - mother_first_name / mother_first_name_en and mother_last_name / mother_last_name_en: extract mother's given and last names similarly.
|
|
620
|
-
|
|
621
|
-
# 3. Gender:
|
|
622
|
-
# - gender_ar: return the Arabic text exactly as printed (e.g., "ذكر" or "أنثى").
|
|
623
|
-
# - gender: map to English "male" or "female" (lowercase). If ambiguous, return "".
|
|
624
|
-
|
|
625
|
-
# 4. Identification numbers:
|
|
626
|
-
# - id_number: must be exactly the 12 digits printed on the card (do not alter digits, do not insert spaces or separators). If not exactly 12 digits, return "".
|
|
627
|
-
# - card_number: exact 9-character document number as printed (preserve letters/digits).
|
|
628
|
-
# - serial_number: optional 6-character serial if present; else "".
|
|
629
|
-
# - blood_type: optional (e.g., "O+", "A-"); return exactly as printed or "".
|
|
630
|
-
|
|
631
|
-
# 5. Formatting & validation:
|
|
632
|
-
# - Trim surrounding whitespace but do not change internal spacing, punctuation, or letter case for name fields.
|
|
633
|
-
# - If both Arabic and English appear for a name under the same label, assign Arabic text to the *_ar field and English/transliteration to the *_en field.
|
|
634
|
-
|
|
635
|
-
# 4. Do NOT guess or hallucinate any values. If unclear, return empty string.
|
|
636
|
-
|
|
637
|
-
# 5. Return structured JSON output as per schema only.
|
|
638
|
-
# """
|
|
639
|
-
|
|
640
|
-
# PROMPT_BACK_IRQ = """
|
|
641
|
-
# You are an expert in reading Iraqi National ID Cards. Extract the following fields from the **back side** of the ID image.
|
|
642
|
-
|
|
643
|
-
# 1. **Extract MRZ lines (Machine Readable Zone):**
|
|
644
|
-
# - Each line must be exactly 30 characters.
|
|
645
|
-
# - Return as a list of exactly 3 strings (`mrz`), in order.
|
|
646
|
-
# - Keep each line exactly as printed (no padding, no fixing).
|
|
647
|
-
# - Remove all whitespace and punctuation.
|
|
648
|
-
# - Return exact number of '<' characters in each line of mrz.
|
|
649
|
-
|
|
650
|
-
# 2. **Verify IDIRQ prefix:**
|
|
651
|
-
# - If the first line of MRZ starts with 'IDIRQ', return `idirq_verified` as true. Otherwise, false.
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
# 3. **Extract and format these fields:**
|
|
655
|
-
# - `dob_back`, `issue_date`, `expiry_date` in **DD/MM/YYYY** format.
|
|
656
|
-
|
|
657
|
-
# 4. **Extract issuing authority:**
|
|
658
|
-
# - `issuing_authority_ar`: Issuing authority (جهة الاصدار / لايانى ددرجوون) in Arabic, exactly as printed.
|
|
659
|
-
# - `'issuing_authority_en'`: TRANSLATED name of the issuing authority (`issuing_authority_ar`) in English.
|
|
660
|
-
|
|
661
|
-
# 5. **Extract place of birth:**
|
|
662
|
-
# - `place_of_birth_ar`: Place of birth in Arabic as printed on the back
|
|
663
|
-
# - `place_of_birth_en`: Transliterated place of birth (`place_of_birth_ar) into English
|
|
664
|
-
|
|
665
|
-
# 6. **Extract Names**
|
|
666
|
-
# - `first_name_back`: First name extracted from MRZ line 3, after the '<<' seperator."
|
|
667
|
-
# - `last_name_back`: Surname extracted from MRZ line 3, before the '<<' separator." If this is not present, return null.
|
|
668
|
-
|
|
669
|
-
# 7. **Extract Family Number:**
|
|
670
|
-
# - `family_number`: 18-character alphanumeric Family number (الرقم العائلي / ژمارەى خێزانی)' exactly as printed (do not alter).
|
|
671
|
-
|
|
672
|
-
# 8. **Extract Nationality:**
|
|
673
|
-
# - `nationality`: 3-letter ISO nationality code, (e.g. IRQ for Iraq).
|
|
674
|
-
|
|
675
|
-
# 8. **DO NOT GUESS.**
|
|
676
|
-
# - If a field is faint, blurry, or unclear, return empty string.
|
|
677
|
-
|
|
678
|
-
# 9. Return output as JSON according to the defined schema.
|
|
679
|
-
# """
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
# PROMPT_PASSPORT_IRQ = """
|
|
683
|
-
# Extract ALL fields from this Iraqi Passport image with high accuracy.
|
|
684
|
-
|
|
685
|
-
# 1. Extract name English:
|
|
686
|
-
# - `full_name`: Full Name, in English, exactly as printed
|
|
687
|
-
# - Do not add anything from the field 'Surname' into this.
|
|
688
|
-
# - `last_name`: Surname, in English, exactly as printed
|
|
689
|
-
# - `mother_name`: Mother's full name in English, exactly as printed
|
|
690
|
-
|
|
691
|
-
# 2. **Extract place of birth:**
|
|
692
|
-
# - If value of Place of Birth is in English, return exactly as printed.
|
|
693
|
-
# - else, if it is not in English, look at the right-hand side of the passport, where it says "Place of Birth"
|
|
694
|
-
# - Transliterate this place of birth into English, if it is only in Arabic.
|
|
695
|
-
# 3. Parse and extract:
|
|
696
|
-
# - `issuing_authority`, exactly as printed in English.
|
|
697
|
-
# - Transliterate `issuing_authority` to English if it is only in Arabic.
|
|
698
|
-
# - `issuing_country`: country of issuance or country code, exactly as printed in English.
|
|
699
|
-
# - `gender`: Gender/Sex either as Male or Female
|
|
700
|
-
# - `dob`, `issue_date`, `expiry_date` → all in DD/MM/YYYY format
|
|
701
|
-
# - `id_number`: must be 9-character alphanumeric passport number.
|
|
702
|
-
# - `nationality`: use 3-letter ISO format (e.g., IRQ for Iraq, JOR for Jordan)
|
|
703
|
-
|
|
704
|
-
# 4. If only two locations are visible, assign the first to place_of_birth and second to issuing_authority.
|
|
705
|
-
|
|
706
|
-
# 5. Ensure that the fields `mrz1` and `mrz2` strictly follow the below format for passports:
|
|
707
|
-
|
|
708
|
-
# - Both `mrz1` and `mrz2` must be exactly 44 characters long.
|
|
709
|
-
# - Use the `<` symbol for padding, **not spaces or any other characters**.
|
|
710
|
-
# - There should be **no commas, no spaces**, and only uppercase English alphabets, digits, and `<` characters are allowed.
|
|
711
|
-
# - If the line is shorter than 44 characters, pad it **only with `<` symbols at the end**, **except**:
|
|
712
|
-
# - In `mrz2`, the final character is a **check digit** (usually numeric) and must remain the last character. Padding with `<` should be applied **before** this digit.
|
|
713
|
-
# - Do not introduce extra characters to make the string 44 characters. Do not insert `<` between letters or numbers — only at the end (or just before the check digit in `mrz2`).
|
|
714
|
-
# - Do not append any punctuation like commas, periods, or symbols.
|
|
715
|
-
|
|
716
|
-
# Return the lines exactly as shown, with **no trailing whitespace** or formatting.
|
|
717
|
-
|
|
718
|
-
# 5. Do not guess or invent any value. If a field is unclear or missing, return empty string.
|
|
719
|
-
|
|
720
|
-
# 6. Output MUST be a structured JSON following the defined schema.
|
|
721
|
-
# """
|
|
722
|
-
|
|
723
|
-
# class IraqiIDCardFront(BaseModel):
|
|
724
|
-
# first_name: str = Field(..., description="First name (الاسم / ناو) in Arabic.")
|
|
725
|
-
# first_name_en: str = Field(..., description="Transliterate First name (الاسم / ناو), to English.")
|
|
726
|
-
# father_name: str = Field(..., description="Father's name (الأب / باوك) in Arabic.")
|
|
727
|
-
# father_name_en: str = Field(..., description="Transliterate Father's name (الأب / باوك) to English.")
|
|
728
|
-
# third_name: str = Field(..., description="Paternal grandfather's name (الجد / بابير) in Arabic.")
|
|
729
|
-
# third_name_en: str = Field(..., description="Transliterate Paternal grandfather's name (الجد / بابير) to English.")
|
|
730
|
-
# last_name: Optional[str] = Field(
|
|
731
|
-
# "",
|
|
732
|
-
# description=(
|
|
733
|
-
# "Family/tribal name (اللقب / نازناو) in Arabic. "
|
|
734
|
-
# "OCR extracts various versions of 'نازناو' like الزناو, الزنار; do not interpret them as the family name."
|
|
735
|
-
# )
|
|
736
|
-
# )
|
|
737
|
-
# last_name_en: Optional[str] = Field(
|
|
738
|
-
# "",
|
|
739
|
-
# description=(
|
|
740
|
-
# "Transliterate Family/tribal name (اللقب / نازناو) to English. "
|
|
741
|
-
# "OCR extracts various versions of 'نازناو' like الزناو, الزنار; do not interpret them as the family name."
|
|
742
|
-
# )
|
|
743
|
-
# )
|
|
744
|
-
# mother_first_name: str = Field(..., description="Mother's name (الام/ دابك) in Arabic.")
|
|
745
|
-
# mother_first_name_en: str = Field(..., description="Transliterate Mother's name (الام/ دابك) to English.")
|
|
746
|
-
# mother_last_name: str = Field(..., description="Maternal grandfather's name (الجد / بابير) in Arabic.")
|
|
747
|
-
# mother_last_name_en: str = Field(...,
|
|
748
|
-
# description="Transliterate Maternal grandfather's name (الجد / بابير) to English.")
|
|
749
|
-
# gender_ar: str = Field(..., description="Gender (الجنس / ردگار): ذكر (male) or أنثى (female).")
|
|
750
|
-
# gender: str = Field(..., description="Translate Gender (الجنس / ردگار) to English")
|
|
751
|
-
# id_number: str = Field(..., description="12-digit national ID number.")
|
|
752
|
-
# card_number: str = Field(..., description="9-character alphanumeric document number.")
|
|
753
|
-
# serial_number: Optional[str] = Field("", description="6-digit card serial number.")
|
|
754
|
-
# blood_type: Optional[str] = Field(None, description="Blood type (e.g., O+, A-).")
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
# class IraqiIDCardBack(BaseModel):
|
|
759
|
-
# issuing_authority_ar: str = Field(..., description="Issuing authority (جهة الاصدار / لايانى ددرجوون) in Arabic")
|
|
760
|
-
# issuing_authority_en: str = Field(..., description="TRANSLATE Issuing authority into English")
|
|
761
|
-
# issue_date: str = Field(..., description="Issue date in DD/MM/YYYY format")
|
|
762
|
-
# expiry_date: str = Field(..., description="Expiry date in DD/MM/YYYY format")
|
|
763
|
-
# place_of_birth_ar: str = Field(..., description="Place of birth in Arabic.")
|
|
764
|
-
# place_of_birth_en: str = Field(..., description="Transliterated Place of birth into English.")
|
|
765
|
-
# dob: str = Field(..., description="Date of birth in DD/MM/YYYY format")
|
|
766
|
-
# family_number: str = Field(...,
|
|
767
|
-
# description='18-character alphanumeric Family number (الرقم العائلي / ژمارەى خێزانی) exactly as printed (do not alter).')
|
|
768
|
-
# mrz: List[str] = Field(..., min_items=3, max_items=3,
|
|
769
|
-
# description="List of 3 MRZ lines. Each line must be exactly as printed on the ID (30 characters, unaltered).")
|
|
770
|
-
# first_name_back: str = Field(..., description="Given name extracted from MRZ line 3, after the '<<' seperator.")
|
|
771
|
-
# last_name_back: Optional[str] = Field(...,
|
|
772
|
-
# description="Surname extracted from MRZ line 3, before the '<<' separator. If this is not present, return null.")
|
|
773
|
-
# idirq_verified: bool = Field(..., description="True if the first MRZ line starts with 'IDIRQ'")
|
|
774
|
-
# nationality: str = Field(..., description="3-letter nationality code (e.g., IRQ for Iraq)")
|
|
775
|
-
|
|
776
|
-
# @validator("idirq_verified", always=True)
|
|
777
|
-
# def check_idirq(cls, v, values):
|
|
778
|
-
# mrz = values.get("mrz", [])
|
|
779
|
-
# return bool(mrz and mrz[0].startswith("IDIRQ"))
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
# class IraqiPassport(BaseModel):
|
|
783
|
-
# full_name: str = Field(..., description="The Full Name, in English, exactly as printed on the document")
|
|
784
|
-
# last_name: str = Field(..., description="Surname of the person on the passport")
|
|
785
|
-
# place_of_birth: str = Field(..., description=("If Place of Birth is in English, return exactly as printed."
|
|
786
|
-
# "If not present in English, look at the right-hand side of the passport, where it says 'Place of Birth'."
|
|
787
|
-
# "Transliterate to English if value of Place of Birth is only in Arabic"))
|
|
788
|
-
|
|
789
|
-
# issuing_authority: str = Field(..., description=("Place of passport issuance in English"
|
|
790
|
-
# "Transliterate to English if issuing authority is only in Arabic"))
|
|
791
|
-
# issuing_country: str = Field(..., description="Issuing Country/Country Code (e.g. 'IRQ', 'JOR')", example='IRQ')
|
|
792
|
-
# mother_name: str = Field(..., description="Mother's full name in English, exactly as printed.")
|
|
793
|
-
# gender: str = Field(..., description="printed as Sex: M or F return 'Male' or 'Female' accordingly")
|
|
794
|
-
# mrz1: str = Field(..., min_length=44, max_length=44,
|
|
795
|
-
# description="First line of the MRZ, exactly 44 characters, padded with '<' at the end if shorter")
|
|
796
|
-
# mrz2: str = Field(..., min_length=44, max_length=44,
|
|
797
|
-
# description="Second line of the MRZ, exactly 44 characters. Padding with '<' must be inserted before the final check digit.")
|
|
798
|
-
# id_number: str = Field(..., pattern=r"^[A-Z][0-9]{8}$",
|
|
799
|
-
# description="Passport number: one uppercase letter followed by 8 digits")
|
|
800
|
-
|
|
801
|
-
# dob: str = Field(
|
|
802
|
-
# ..., description="Date of birth in DD/MM/YYYY format"
|
|
803
|
-
# )
|
|
804
|
-
# issue_date: str = Field(
|
|
805
|
-
# ..., description="Issue date in DD/MM/YYYY format"
|
|
806
|
-
# )
|
|
807
|
-
# expiry_date: str = Field(
|
|
808
|
-
# ..., description="Expiry date in DD/MM/YYYY format"
|
|
809
|
-
# )
|
|
810
|
-
# nationality: str = Field(
|
|
811
|
-
# ..., description="Nationality in ISO 3166-1 alpha-3 format (e.g., SDN)"
|
|
812
|
-
# )
|
|
813
|
-
|
|
814
|
-
# header_verified: bool = Field(
|
|
815
|
-
# ..., description="True if document header ('IRQ', 'Republic of Iraq') is detected"
|
|
816
|
-
# )
|
|
817
|
-
|
|
818
|
-
# class IdentifyIRQSideResponse(BaseModel):
|
|
819
|
-
# is_valid_id: bool = Field(..., description="Return True if document is either a valid Iraqi National ID's front side or back side."
|
|
820
|
-
# "It should contain Arabic/Kurdish text like: جمهورية العراق / وزارة الداخلية"
|
|
821
|
-
# "مديرية الأحوال المدنية والجوازات والاقامة"
|
|
822
|
-
# "کوماری عیراق / وه زاره تی ناوخو"
|
|
823
|
-
# "پریود به را بائی باری شارستانی و پاسپورت و نیشنگه"
|
|
824
|
-
# "جمهورية العراق / وزارة الداخلية"
|
|
825
|
-
# "کوماری عیراق / وه زاره تی ناوخو"
|
|
826
|
-
# "Return empty string '' otherwise.")
|
|
827
|
-
# # side should be one of the front, back or empty string
|
|
828
|
-
# side: str = Field(..., description="Determine if this is a front side or back side of an Iraqi National ID. Return empty string if its neither."
|
|
829
|
-
# "A back side has three lines of MRZ, has dates of birth, issue and expiry"
|
|
830
|
-
# "A front side has names, and id number. No dates. return front or back accordingly.")
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
# def _image_to_jpeg_bytesio(image) -> BytesIO:
|
|
835
|
-
# """
|
|
836
|
-
# Accepts: numpy.ndarray (OpenCV BGR), PIL.Image.Image, bytes/bytearray, or io.BytesIO
|
|
837
|
-
# Returns: io.BytesIO containing JPEG bytes (ready for get_openai_response)
|
|
838
|
-
# """
|
|
839
|
-
# import numpy as np
|
|
840
|
-
|
|
841
|
-
# if isinstance(image, BytesIO):
|
|
842
|
-
# image.seek(0)
|
|
843
|
-
# return image
|
|
844
|
-
|
|
845
|
-
# if isinstance(image, (bytes, bytearray)):
|
|
846
|
-
# return BytesIO(image)
|
|
847
|
-
|
|
848
|
-
# try:
|
|
849
|
-
# from PIL.Image import Image as _PILImage
|
|
850
|
-
|
|
851
|
-
# if isinstance(image, _PILImage):
|
|
852
|
-
# buf = BytesIO()
|
|
853
|
-
# image.convert("RGB").save(buf, format="JPEG", quality=95)
|
|
854
|
-
# buf.seek(0)
|
|
855
|
-
# return buf
|
|
856
|
-
# except Exception:
|
|
857
|
-
# pass
|
|
858
|
-
|
|
859
|
-
# if isinstance(image, np.ndarray):
|
|
860
|
-
# success, enc = cv2.imencode(".jpg", image)
|
|
861
|
-
# if not success:
|
|
862
|
-
# raise ValueError("cv2.imencode failed")
|
|
863
|
-
# return BytesIO(enc.tobytes())
|
|
864
|
-
|
|
865
|
-
# raise TypeError(
|
|
866
|
-
# "Unsupported image type. Provide numpy.ndarray, PIL.Image.Image, bytes, or io.BytesIO."
|
|
867
|
-
# )
|
|
868
|
-
|
|
869
|
-
# def get_irq_side_from_openai(image, openai_key):
|
|
870
|
-
|
|
871
|
-
# logging.info(f"Getting side of Iraqi ID from OpenAI... and type of image {type(image)}")
|
|
872
|
-
# base_64_image = _image_to_jpeg_bytesio(image)
|
|
873
|
-
# b64_image = base64.b64encode(base_64_image.getvalue()).decode("utf-8")
|
|
874
|
-
|
|
875
|
-
# logging.info(f"Converted image to JPEG BytesIO for OpenAI processing. type of base_64_image {type(b64_image)}")
|
|
876
|
-
# for attempt in range(3):
|
|
877
|
-
# try:
|
|
878
|
-
# client = OpenAI(api_key=openai_key)
|
|
879
|
-
# # image_data = base64.b64decode(b64_image)
|
|
880
|
-
# response = client.responses.parse(
|
|
881
|
-
# model="gpt-4.1-mini",
|
|
882
|
-
# input = [{"role": "system", "content": "You are an expert at extracting information from identity documents, extract data as per fields, dont use any additional text or infer from mrz data."},
|
|
883
|
-
# {"role": "user", "content": [
|
|
884
|
-
# {"type": "input_text", "text": PROMPT_IDENTIFY_IRQ_SIDE},
|
|
885
|
-
# {"type": "input_image", "image_url": f"data:image/jpeg;base64,{b64_image}", "detail": "low"},
|
|
886
|
-
# ]},
|
|
887
|
-
# ],
|
|
888
|
-
# text_format = IdentifyIRQSideResponse
|
|
889
|
-
# )
|
|
890
|
-
# logging.info(f"Received response from OpenAI for side identification., {response.output_parsed}")
|
|
891
|
-
|
|
892
|
-
# return vars(response.output_parsed)
|
|
893
|
-
|
|
894
|
-
# except Exception as e:
|
|
895
|
-
# logging.error(f"Error in get_side_from_openAI attempt {attempt + 1}: {e}")
|
|
896
|
-
# time.sleep(2)
|
|
897
|
-
# return {"is_valid_id": "", "side": ""}, b64_image
|
|
898
|
-
|
|
899
|
-
# def get_openai_response_irq(prompt: str, model_type, image: BytesIO, genai_key):
|
|
900
|
-
|
|
901
|
-
# for attempt in range(3):
|
|
902
|
-
# try:
|
|
903
|
-
# client = OpenAI(api_key=genai_key)
|
|
904
|
-
# response = client.responses.parse(
|
|
905
|
-
# model="gpt-4.1-mini",
|
|
906
|
-
# input=[
|
|
907
|
-
# {"role": "system",
|
|
908
|
-
# "content": "You are an expert at extracting information from identity documents."},
|
|
909
|
-
# {"role": "user", "content": [
|
|
910
|
-
# {"type": "input_text", "text": prompt},
|
|
911
|
-
# {"type": "input_image", "image_url": f"data:image/jpeg;base64,{image}", "detail": "low"},
|
|
912
|
-
# ]},
|
|
913
|
-
# ],
|
|
914
|
-
# text_format=model_type,
|
|
915
|
-
# )
|
|
916
|
-
# return response.output_parsed
|
|
917
|
-
# except Exception as e:
|
|
918
|
-
# logging.info(f"[ERROR] Attempt {attempt + 1} failed: {str(e)}")
|
|
919
|
-
# time.sleep(2)
|
|
920
|
-
# return None
|
|
921
|
-
|
|
922
|
-
# def process_image_irq(side):
|
|
923
|
-
# if side == "front":
|
|
924
|
-
# prompt = PROMPT_FRONT_IRQ
|
|
925
|
-
# model = IraqiIDCardFront
|
|
926
|
-
|
|
927
|
-
# elif side == "back":
|
|
928
|
-
# prompt = PROMPT_BACK_IRQ
|
|
929
|
-
# model = IraqiIDCardBack
|
|
930
|
-
|
|
931
|
-
# elif side == "passport":
|
|
932
|
-
# prompt = PROMPT_PASSPORT_IRQ
|
|
933
|
-
# model = IraqiPassport
|
|
934
|
-
# else:
|
|
935
|
-
# raise ValueError("Invalid document side specified. Use 'front', 'back', or 'passport'.")
|
|
936
|
-
|
|
937
|
-
# return model, prompt
|
|
938
|
-
|
|
939
|
-
# def get_response_from_openai_irq(image, side, openai_key):
|
|
940
|
-
# logging.info(f"Getting response from OpenAI for Iraqi Id side {side}... and type of image {type(image)}")
|
|
941
|
-
# try:
|
|
942
|
-
# base_64_image = _image_to_jpeg_bytesio(image)
|
|
943
|
-
# b64_image = base64.b64encode(base_64_image.getvalue()).decode("utf-8")
|
|
944
|
-
# logging.info(f"Converted image to JPEG BytesIO for OpenAI processing. type of base_64_image {type(b64_image)}")
|
|
945
|
-
# except Exception as e:
|
|
946
|
-
# logging.error(f"Error converting image: {e}")
|
|
947
|
-
# return {"error": "Image conversion failed"}
|
|
948
|
-
# try:
|
|
949
|
-
# model, prompt = process_image_irq(side)
|
|
950
|
-
# logging.info(f"Using model: {model.__name__} and prompt {prompt[:100]}")
|
|
951
|
-
# except ValueError as ve:
|
|
952
|
-
# logging.error(f"Error: {ve}")
|
|
953
|
-
# return {"error": str(ve)}
|
|
954
|
-
|
|
955
|
-
# try:
|
|
956
|
-
# response = get_openai_response_irq(prompt, model, b64_image, openai_key)
|
|
957
|
-
# except Exception as e:
|
|
958
|
-
# logging.error(f"Error during OpenAI request: {e}")
|
|
959
|
-
# return {"error": "OpenAI request failed"}
|
|
960
|
-
|
|
961
|
-
# response_data = vars(response)
|
|
962
|
-
# logging.info(f"Openai response: {response}")
|
|
963
|
-
# return response_data
|