idvpackage 3.0.11__py3-none-any.whl → 3.0.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,220 +1,319 @@
1
+ import base64
1
2
  import json
2
- import time
3
- import datetime
4
- import openai
5
- from langchain.tools import tool
6
- from langchain.tools.render import format_tool_to_openai_function
7
- from langchain.prompts import ChatPromptTemplate
8
- from langchain.chat_models import ChatOpenAI
9
- from pydantic import BaseModel, Field, validator
10
- from langchain.utils.openai_functions import convert_pydantic_to_openai_function
11
- from typing import Optional, Literal
12
- from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
13
- from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
14
- import idvpackage.genai_utils as genai_utils
15
- import idvpackage.genai_utils as sanity_utils
16
- from datetime import datetime, timedelta
17
- from dateutil.relativedelta import relativedelta
18
- from pydantic import ValidationError
19
3
  import logging
20
- from langchain.schema.agent import AgentFinish
21
- # import base64
22
- # import time
23
- # from io import BytesIO
24
- # from typing import Set, List, Optional
25
- # import json
26
- # import cv2
27
- # import torch
28
- # from PIL import Image
29
- # from openai import OpenAI
30
- # from pydantic import BaseModel, Field, validator
31
- # import logging
32
-
33
-
34
-
35
- # logging.basicConfig(
36
- # level=logging.INFO,
37
- # format='%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s',
38
- # datefmt='%Y-%m-%d %H:%M:%S',
39
- # force=True
40
- # )
41
-
42
- class Verify_IRQ_Passport(BaseModel):
43
- """Validates whether a given OCR text represents a valid Iraqi Passport"""
44
- is_valid_id: Literal["True", "False"] = Field(..., description="Return True if document is a valid Iraqi Passport"
45
- "It should contain Arabic/Kurdish text like: جمهورية العراق, کۆماری عێراق and English Text: Republic of Iraq"
46
- "Return False otherwise.")
47
- side: Literal["passport", ""] = Field(..., description="Return passport if the document is a valid Iraqi Passport")
4
+ import time
48
5
 
49
- class Iraq_Passport(BaseModel):
50
- """Extract the fields from the OCR extracted text of an Iraqi Passport"""
51
- full_name: str = Field(..., description="Full name of the person on the passport")
52
- last_name: str = Field(..., description="Surname of the person on the passport")
53
- dob: str = Field(..., description="Date of Birth")
54
- place_of_birth: str = Field(...,
55
- description=(
56
- "Place of Birth of the person on the passport"
57
- "DO NOT mix it up with Issuing Authority"
58
- "Translate to English"
59
- )
60
- )
61
- mother_name: str = Field(..., description="Mother's full name")
62
- gender_letter: str = Field(..., description="Gender/Sex of the person on the passport. It is either M or F.")
63
- issuing_authority: str = Field(...,
64
- description=(
65
- "Issuing Authority"
66
- "Translate to English"
67
- )
68
- )
69
- nationality: str = Field(..., description="Nationality in ISO 3166-1 alpha-3 format (e.g., 'IRQ' for Iraqi, 'JOR' for Jordanian)", example="IRQ")
70
- issuing_country: str = Field(..., description="Issuing Country/Country Code (e.g. 'IRQ', 'JOR')", example='IRQ')
71
- id_number: str = Field(..., description="9-character alphanumeric passport number.")
72
- mrz1: str = Field(...,
73
- description=(
74
- "MRZ Line 1."
75
- "Should be exactly 44 characters long."
76
- "If OCR splits it across lines, join them into one."
77
- "Do not confuse with MRZ Line 2 — Line 1 typically starts with 'P<' and contains names."
78
- )
79
- )
80
-
81
- mrz2: str = Field(...,
82
- description=(
83
- "MRZ Line 2."
84
- "Should be exactly 44 characters long."
85
- "If OCR splits it across lines, join them into one string."
86
- "Do not confuse with MRZ Line 1 — Line 2 contains passport number, nationality, DOB, expiry, etc."
87
- )
88
- )
89
-
90
- @validator("mrz2")
91
- def validate_mrz2_content_length(cls, v):
92
- if len(v.replace('<', '')) < 28:
93
- raise ValueError("cropped_mrz")
94
- return v
95
-
96
-
97
- @tool(args_schema=Iraq_Passport)
98
- def sanity_check_irq_passport(full_name='',
99
- last_name='',
100
- dob='',
101
- place_of_birth='',
102
- mother_name='',
103
- gender_letter='',
104
- issuing_authority='',
105
- nationality='',
106
- issuing_country='',
107
- id_number='',
108
- mrz='',
109
- mrz1='',
110
- mrz2=''):
111
- try:
112
6
 
113
- # if len(mrz1)<44:
114
- # return {'error':'covered_photo','error_details':'cropped mrz'}
115
- #
116
- # if len(mrz2)<44:
117
- # return {'error': 'covered_photo', 'error_details': 'cropped mrz'}
7
+ from openai import OpenAI
8
+ from pydantic import BaseModel, Field
118
9
 
119
- # if len(mrz2.replace('<',''))<30:
120
- # return {'error': 'covered_photo', 'error_details': 'cropped mrz'}
121
10
 
122
11
 
123
- mrz = mrz1 + mrz2
12
+ PROMPT_PASSPORT_IRQ = """
13
+ You are given an image of an Iraqi passport. Your task is to read the passport and return ONLY a valid JSON object matching the IraqiPassport schema.
124
14
 
15
+ Follow all rules exactly. Do NOT guess or invent any value.
125
16
 
17
+ ================================================
18
+ 1. LANGUAGE RULES
19
+ ================================================
126
20
 
127
- id_number = mrz2[0:9]
21
+ - Read all visible text in English, Arabic, and Kurdish.
22
+ - Prefer English if it exists.
23
+ - If the **field label is in English but the VALUE is in Arabic/Kurdish**,
24
+ → transliterate the VALUE into English letters.
25
+ - If a field appears ONLY in Arabic/Kurdish:
26
+ → transliterate it to English letters (romanization).
27
+ - Do NOT translate or change meaning.
28
+ - If any value is unreadable or missing → return "".
128
29
 
129
- if id_number[0] == '8':
130
- id_number = 'B' + id_number[1:]
30
+ ================================================
31
+ 2. FIELDS TO EXTRACT (IraqiPassport schema)
32
+ ================================================
131
33
 
132
- expiry_date = mrz2.replace(" ", "")[21:27]
133
- expiry_date = sanity_utils.parse_yymmdd(expiry_date) # string 'YYYY-MM-DD'
134
- is_doc_expired = sanity_utils.is_expired_id(expiry_date)
34
+ You must fill ALL fields:
135
35
 
136
- if is_doc_expired:
137
- return {"error": "expired_id", "error_details": "expired ID"}
36
+ 1) Names:
37
+ - `full_name`: English full name exactly as printed.
38
+ - `last_name`: English surname exactly as printed.
39
+ - `mother_name`: Mother's name in English.
40
+ - If any of these appear only in Arabic/Kurdish → transliterate the value.
138
41
 
139
- # Reuse expiry_date for datetime object temporarily for calculations
140
- expiry_date_obj = datetime.strptime(expiry_date, "%Y-%m-%d")
141
- issue_date = (expiry_date_obj - relativedelta(years=8) + timedelta(days=1)).strftime("%Y-%m-%d")
142
- try:
143
- dob = sanity_utils.convert_dob_to_standard(dob)
144
- expiry_date = sanity_utils.convert_dob_to_standard(expiry_date)
145
- except Exception as e:
146
- return {"error": "covered_photo", "error_details": "Exception Thrown while parsing dates: {e}"}
147
-
148
-
149
-
150
- if gender_letter.lower() not in ['m','f','male','female']:
151
- gender_letter = mrz2[20]
152
-
153
- if gender_letter.lower()=='m':
154
- gender = 'Male'
155
- elif gender_letter.lower()=='f':
156
- gender = 'Female'
157
-
158
-
159
- optional_fields = [gender_letter]
160
- required_fields = {k: v for k, v in locals().items() if k not in optional_fields}
161
-
162
- missing = [key for key, value in required_fields.items() if not str(value).strip()]
163
- if missing:
164
- return {
165
- 'error': 'covered_photo',
166
- 'error_details': f'Missing or empty fields: {", ".join(missing)}'
167
- }
168
- result = {
169
- "error": "",
170
- "error_details": "",
171
- "doc_type":"passport",
172
- **locals()
173
- }
174
-
175
- if 'expiry_date_obj' in result.keys():
176
- del result['expiry_date_obj']
177
- if 'optional_fields' in result.keys():
178
- del result['optional_fields']
179
- if 'required_fields' in result.keys():
180
- del result['required_fields']
181
- if 'missing' in result.keys():
182
- del result['missing']
183
- return result
184
- except Exception as e:
185
- return {'error':'covered_photo','error_details':e}
186
-
187
-
188
- class Verify_IRQ_ID(BaseModel):
189
- """Validates whether a given OCR text represents a valid Iraqi National ID (either front or back side)."""
190
- is_valid_id: Literal["True", ""] = Field(..., description="Return True if document is either a valid Iraqi National ID's front side or back side."
191
- "It should contain Arabic/Kurdish text like: جمهورية العراق / وزارة الداخلية"
192
- "مديرية الأحوال المدنية والجوازات والاقامة"
193
- "کوماری عیراق / وه زاره تی ناوخو"
194
- "پریود به را بائی باری شارستانی و پاسپورت و نیشنگه"
195
- "جمهورية العراق / وزارة الداخلية"
196
- "کوماری عیراق / وه زاره تی ناوخو"
197
- "Return empty string '' otherwise.")
198
- side: Literal["front","back",""] = Field(..., description="Determine from the given ocr text, if this is a front side or back side of an Iraqi National ID. Return empty string if its neither."
199
- "A back side has three lines of MRZ, has dates of birth, issue and expiry"
200
- "A front side has names, and id number. No dates.")
42
+ 2) Place of Birth:
43
+ - `place_of_birth`:
44
+ - If English value exists return it exactly.
45
+ - If value is shown in Arabic/Kurdish (even beside English label)
46
+ transliterate only the VALUE into English letters.
47
+
48
+ 3) Issuing Authority:
49
+ - `issuing_authority`:
50
+ - If English value exists → return exactly.
51
+ - If the value under the English label is Arabic/Kurdish
52
+ → transliterate only the VALUE into English letters.
53
+
54
+ 4) Issuing Country:
55
+ - `issuing_country`: Use English country name or code (e.g. IRQ).
56
+ - If only Arabic/Kurdish appears → transliterate and map correctly.
57
+
58
+ 5) Gender:
59
+ - `gender_letter`: “M” or “F” only.
60
+
61
+ 6) Dates:
62
+ - `dob`, `issue_date`, `expiry_date`:
63
+ - Convert to DD/MM/YYYY format.
64
+ - If unclear → return empty string.
65
+
66
+ 7) Passport Number:
67
+ - `id_number`: Must be 1 uppercase letter + 8 digits.
68
+ - If not clearly readable → return "".
69
+
70
+ 8) Nationality:
71
+ - `nationality`: Use 3-letter ISO format (e.g., IRQ).
72
+ - If only Arabic/Kurdish appears (e.g. “عراقي”) → transliterate then map.
73
+
74
+ 9) Fallback rule:
75
+ - If ONLY two locations appear (in Arabic/Kurdish) without labels:
76
+ - 1st → place_of_birth
77
+ - 2nd → issuing_authority
78
+
79
+ 10) Header:
80
+ - `header_verified`: Return True if the document is a valid Iraqi Passport.
81
+ It should contain Arabic/Kurdish text like: جمهورية العراق, کۆماری عێراق and English text: Republic of Iraq.
82
+ Return False otherwise.
83
+ ================================================
84
+ 3. MRZ RULES
85
+ ================================================
86
+
87
+ - `mrz1` and `mrz2` must be EXACTLY 44 characters.
88
+ - Allowed: A–Z, 0–9, `<`.
89
+ - No spaces or punctuation.
90
+ - If a line is shorter → pad with `<` at the END.
91
+ - In `mrz2`: final char is check digit; pad BEFORE it.
92
+ - Do not add, remove, or modify characters except padding.
93
+ - Dates inside the MRZ lines are encoded as YYMMDD without any separators; the derived `dob_mrz` and `expiry_date_mrz` fields must be returned in DD/MM/YYYY format.
94
+ - passport number in MRZ must be exactly 9 characters (1 letter + 8 digits), padded with `<` if necessary.
95
+ - gender_mrz: extract gender from MRZ line 2 if M return `MALE`, if F return `FEMALE`
96
+ - expiry_date_mrz: extract expiry date from MRZ line 2 in DD/MM/YYYY format
97
+ ================================================
98
+ 4. NO GUESSING & FINAL OUTPUT
99
+ ================================================
100
+
101
+ - If any field is missing or unreadable return an empty string.
102
+ - Do NOT infer values.
103
+ - Output ONLY a single JSON object matching the IraqiPassport schema.
104
+
105
+
106
+ """
107
+ class IraqiPassport(BaseModel):
108
+ # ocr_text: str = Field(..., description="Full OCR extracted text from the Iraqi Passport image.")
109
+ full_name: str = Field(..., description="The Full Name, in English, exactly as printed on the document")
110
+ last_name: str = Field(..., description="Surname of the person on the passport")
111
+ place_of_birth: str = Field(..., description=("If Place of Birth is in English, return exactly as printed."
112
+ "If not present in English, look at the right-hand side of the passport, where it says 'Place of Birth'."
113
+ "Transliterate to English if value of Place of Birth is only in Arabic"))
114
+
115
+ issuing_authority: str = Field(..., description=("Place of passport issuing authority in English"
116
+ "Transliterate to English if issuing authority is only in Arabic"))
117
+ issuing_country: str = Field(..., description="Issuing Country/Country Code (e.g. 'IRQ', 'JOR')", example='IRQ')
118
+ mother_name: str = Field(..., description="Mother's full name in English, exactly as printed.")
119
+ gender_letter: str = Field(..., description="Sex: M or F")
120
+ mrz1: str = Field(..., min_length=44, max_length=44,
121
+ description="First line of the MRZ, exactly 44 characters, padded with '<' at the end if shorter")
122
+ mrz2: str = Field(..., min_length=44, max_length=44,
123
+ description="Second line of the MRZ, exactly 44 characters. Padding with '<' must be inserted before the final check digit.")
124
+ id_number: str = Field(..., pattern=r"^[A-Z][0-9]{8}$",
125
+ description="Passport number: one uppercase letter followed by 8 digits")
126
+
127
+ dob: str = Field(
128
+ ..., description="Date of birth in DD/MM/YYYY format"
129
+ )
130
+ issue_date: str = Field(
131
+ ..., description="Issue date in DD/MM/YYYY format"
132
+ )
133
+ expiry_date: str = Field(
134
+ ..., description="Expiry date in DD/MM/YYYY format"
135
+ )
136
+ nationality: str = Field(
137
+ ..., description="Nationality in ISO 3166-1 alpha-3 format (e.g., IRQ)"
138
+ )
139
+
140
+ header_verified: bool = Field(
141
+ ..., description="Return True if document is a valid Iraqi Passport"
142
+ "It should contain Arabic/Kurdish text like: جمهورية العراق, کۆماری عێراق and English Text: Republic of Iraq"
143
+ "Return False otherwise."
144
+ )
145
+ dob_mrz: str = Field(
146
+ ..., description="Date of birth as extracted from MRZ (in DD/MM/YYYY format)"
147
+ )
148
+ id_number_mrz: str = Field(
149
+ ..., min_length=9, max_length=9, description="ID number as extracted from MRZ"
150
+ )
151
+ expiry_date_mrz: str = Field(
152
+ ..., description="Expiry date as extracted from MRZ (in DD/MM/YYYY format)"
153
+ )
154
+ gender_mrz: str = Field(
155
+ ..., description="Gender as extracted from MRZ (M or F) if M return MALE else if F return FEMALE"
156
+ )
201
157
 
158
+ PROMPT_FRONT_IRQ = """
159
+ You will receive an image of the **front side** of an Iraqi National ID Card.
160
+ Follow ALL steps strictly. Do NOT skip, modify, assume, or guess any information.
161
+
162
+ 1.OCR (MANDATORY)
163
+ Perform full OCR on the entire image.
164
+ Extract ALL visible Arabic, Kurdish, numbers, and symbols EXACTLY as printed.
165
+ Keep line breaks, spacing, spelling, diacritics, and punctuation.
166
+ Do NOT correct, normalize, or reorder text.
167
+ Return the full OCR text in: ocr_text.
168
+
169
+ 2.PERSONAL DETAILS EXTRACTION (ALL THE INFORMATION REQUESTED IS PRESENT ON THE FRONT SIDE OF THE ID CARD)
170
+ From the OCR text, extract the following fields EXACTLY as printed:
171
+ Extract these fields EXACTLY as printed.
172
+ The ID card contains only Arabic/Kurdish text, so the *_en fields must be generated by translating the Arabic/Kurdish name into its English form.
173
+ This translation is REQUIRED and is NOT considered guessing.
174
+ Only leave *_en fields empty if the Arabic field itself is empty.
175
+
176
+
177
+ - first_name: Arabic first name
178
+ - first_name_en: translate the first name into English
179
+ - father_name: Arabic father name
180
+ - father_name_en: translate the father name into English
181
+ - third_name: Arabic paternal grandfather name
182
+ - third_name_en: translate the paternal grandfather name into English
183
+ - last_name: Arabic family/tribal name (empty string if missing)
184
+ - last_name_en: translate the family/tribal name into English (empty string if missing)
185
+ - mother_first_name: Arabic mother first name
186
+ - mother_first_name_en: translate the mother first name into English
187
+ - mother_last_name: Arabic maternal grandfather/mother’s last name
188
+ - mother_last_name_en: translate the maternal grandfather/mother’s last name into English
189
+
190
+ 3.Gender:
191
+ You MUST read gender ONLY from the Arabic field labeled "الجنس" or "جنس".
192
+ Do NOT infer gender from the name or photo.
193
+
194
+ - gender_ar: ذكر or أنثى
195
+
196
+ - gender: Male or Female
197
+
198
+ 4.DOCUMENT DETAILS EXTRACTION
199
+ Extract all document identifiers exactly as printed:
200
+ - id_number: 12-digit National ID Number
201
+ - card_number: 9-character card/document number
202
+ - serial_number: 6-character serial number printed vertically on the card (empty if missing)
203
+ - blood_type: e.g., O+, A-, AB+ (empty if missing)
204
+
205
+ 5.HEADER VERIFICATION
206
+ Check if the header is present:
207
+ - Must contain at least one of the following Arabic/Kurdish issuing authority texts:
208
+ "جمهورية العراق"
209
+ "وزارة الداخلية"
210
+ "مديرية الأحوال المدنية والجوازات والاقامة"
211
+ "کوماری عیراق"
212
+ "وه زاره تی ناوخو"
213
+ "پریود به را بائی باری شارستانی و پاسپورت و نیشنگه"
214
+ - Set `header_verified = true` only if at least one appears clearly.
215
+ - Otherwise set `header_verified = false`.
216
+
217
+ 6.VALIDATION RULES
218
+
219
+ - Do NOT guess, assume, or hallucinate.
220
+ - If a field is missing, unclear, faint, or unreadable → return empty string.
221
+ - Output MUST match the schema exactly.
222
+
223
+
224
+ 7.OUTPUT FORMAT
225
+
226
+ Return ONLY structured JSON.
227
+ No explanations. No commentary.
228
+ """
229
+
230
+
231
+
232
+ PROMPT_BACK_IRQ = """
233
+ You will be provided with an image showing the **back side of an Iraqi National ID Card**.
234
+ Follow every instruction with absolute precision. Do NOT infer, assume, or guess any information.
235
+
236
+ Extract EVERY visible Arabic, Kurdish, English, number, and symbol EXACTLY as printed.
237
+ You MUST capture even faint, blurred, low-contrast, small, or partially visible text.
238
+
239
+ 1. OCR Requirements:
240
+ - Preserve original line breaks and spacing
241
+ - Preserve punctuation, diacritics, and character shapes
242
+ - Preserve text even if rotated, tilted, or near the edges
243
+ - Do NOT correct spelling or normalize text
244
+ - Do NOT merge or reorder lines
245
+ - Do NOT skip faint or partial text
246
+
247
+ Return the full raw OCR output exactly as extracted in: raw_ocr_text.
248
+
249
+ 2. MRZ (Machine Readable Zone)
250
+ - mrz1: first MRZ line (exactly as printed, 30 characters, keep '<', remove spaces around it)
251
+ - mrz2: second MRZ line (same rule, 30 characters)
252
+ - mrz3: third MRZ line (same rule, 30 characters)
253
+ If any MRZ line is missing, set it to "".
254
+
255
+ 3.HEADER VERIFICATION
256
+ - header_verified: true if "IDIRQ" appears anywhere in mrz1, otherwise false.
257
+
258
+ 4. DATE FIELDS (DD/MM/YYYY)
259
+ From raw_ocr_text, extract:
260
+ - dob: date of birth
261
+ - issue_date
262
+ - expiry_date
263
+ Dates must be in DD/MM/YYYY format.
264
+ If a date is missing or unclear, return "" for that field.
265
+
266
+ 5.ISSUING AUTHORITY
267
+ - issuing_authority_ar: Arabic text of the issuing authority exactly as printed
268
+ (often like "مديرية الأحوال المدنية - [City]")
269
+ - issuing_authority_en: English translation of the issuing authority
270
+
271
+ 6. PLACE OF BIRTH
272
+
273
+ - place_of_birth: Arabic text
274
+ - place_of_birth_en: translation
275
+
276
+ 7. NAMES FROM MRZ LINE 3
277
+ From mrz3 (if present):
278
+ - last_name_back: the text before '<<'
279
+ - first_name_back: the text after '<<'
280
+ If mrz3 is empty, both fields must be "".
281
+
282
+ 8.FAMILY NUMBER
283
+ - family_number: 18-character alphanumeric value if present in raw_ocr_text, else "".
284
+ - family_number_en: same as family_number.
285
+
286
+ 9. NATIONALITY
287
+ - nationality: 3-letter code (e.g., "IRQ") if present, else "".
288
+ 10.GENDER FROM MRZ LINE 2
289
+ - gender_mrz: 'M' or 'F' extracted from mrz2. If mrz2 is missing, return "".
290
+ 11.dob_mrz: date of birth taken from the first six characters of mrz2 (YYMMDD), returned in DD/MM/YYYY format. If mrz2 is missing, return "".
291
+ 12.expiry_date_mrz: expiry date from mrz2 in DD/MM/YYYY format. If mrz2 is missing, return "".
292
+ 13.card_number_mrz: document number from mrz1. If mrz1 is missing, return "".
293
+
294
+ RULES:
295
+ - Never guess or infer. If something is not clearly present in raw_ocr_text, return "".
296
+ - Return ONLY a JSON object matching the schema exactly.
297
+
298
+ """
202
299
  class Iraq_National_ID_front(BaseModel):
203
- """Extract the fields from the OCR extracted text of an Iraqi National ID's front side. Front Side has names, (like father name, mother name etc.), national id numbers but has no dates. Translate wherever required."""
300
+ """Extract the fields from the OCR extracted text of an Iraqi National ID's front side. Front Side has names, (like father name, mother name etc.), national id numbers but has no dates.
301
+ Translate wherever required."""
302
+ ocr_text: str = Field(..., description="Full OCR extracted text from the Iraqi National ID front side image.")
204
303
  first_name: str = Field(..., description="First name (الاسم / ناو) in Arabic.")
205
304
  first_name_en: str = Field(..., description="First name (الاسم / ناو), translated to English.")
206
305
  father_name: str = Field(..., description="Father's name (الأب / باوك) in Arabic.")
207
306
  father_name_en: str = Field(..., description="Father's name (الأب / باوك), translated to English.")
208
307
  third_name: str = Field(..., description="Paternal grandfather's name (الجد / بابير) in Arabic.")
209
308
  third_name_en: str = Field(..., description="Paternal grandfather's name (الجد / بابير), translated to English.")
210
- last_name: Optional[str] = Field(
309
+ last_name: str = Field(
211
310
  "",
212
311
  description=(
213
312
  "Family/tribal name (اللقب / نازناو) in Arabic. "
214
313
  "OCR extracts various versions of 'نازناو' like الزناو, الزنار; do not interpret them as the family name."
215
314
  )
216
315
  )
217
- last_name_en: Optional[str] = Field(
316
+ last_name_en: str = Field(
218
317
  "",
219
318
  description=(
220
319
  "Family/tribal name (اللقب / نازناو), translated to English. "
@@ -225,94 +324,25 @@ class Iraq_National_ID_front(BaseModel):
225
324
  mother_first_name_en: str = Field(..., description="Mother's name (الام/ دابك), translated to English.")
226
325
  mother_last_name: str = Field(..., description="Maternal grandfather's name (الجد / بابير) in Arabic.")
227
326
  mother_last_name_en: str = Field(..., description="Maternal grandfather's name (الجد / بابير), translated to English.")
228
- gender_ar: str = Field(..., description="Gender (الجنس / ردگار): ذكر (male) or أنثى (female).")
229
- gender: str = Field(..., description="Gender (الجنس / ردگار), translated to English")
230
- id_number_front: str = Field(..., description="12-digit national ID number.")
231
- card_number_front: str = Field(..., description="9-character alphanumeric document number.")
232
- serial_number: Optional[str] = Field("", description="6-digit card serial number.")
233
- blood_type: Optional[str] = Field(None, description="Blood type (e.g., O+, A-).")
234
-
235
- @tool(args_schema=Iraq_National_ID_front)
236
- def sanity_check_irq_front(
237
- id_number_front='',
238
- card_number_front='',
239
- first_name='',
240
- first_name_en='',
241
- father_name='',
242
- father_name_en='',
243
- third_name='',
244
- third_name_en='',
245
- last_name='',
246
- last_name_en='',
247
- mother_first_name='',
248
- mother_first_name_en='',
249
- mother_last_name='',
250
- mother_last_name_en='',
251
- gender_ar='',
252
- gender='',
253
- blood_type='',
254
- serial_number=''
255
-
256
- ) -> dict:
257
- print("SANITY CHECK IRQ FRONT WAS CALLED")
258
- """Run sanity checks on the data extracted from Iraq national ID's front side."""
259
- #Post-Processing steps
260
- try:
261
- if not id_number_front.isdigit() or len(id_number_front) != 12:
262
- return {'error': 'invalid_national_number', 'error_details': 'invalid national number, please take a clearer picture of your image. Note: We do not accept Civil Status IDs.'}
263
-
264
- if len(card_number_front) != 9:
265
- return {'error': 'invalid_document_number', 'error_details': 'invalid document number, please take a clearer picture of your image. Note: We do not accept Civil Status IDs.'}
266
-
267
- doc_type = 'national_identity_card'
268
- #at this point, verify_irq_id has run, so we can safely assume the nationality here is IRQ
269
- nationality='IRQ'
270
- nationality_en = 'IRQ'
271
-
272
- optional_fields = ('last_name', 'last_name_en','serial_number','blood_type')
273
- required_fields = {k: v for k, v in locals().items() if k not in optional_fields}
274
-
275
- result_dict = {**locals()}
276
-
277
-
278
-
279
-
280
-
281
-
282
- if not last_name or not last_name_en:
283
- name = result_dict.get('first_name', '') + " " + result_dict.get('father_name', '')
284
- name_en = result_dict.get('first_name_en', '') + " " + result_dict.get('father_name_en', '')
285
- else:
286
- name = result_dict.get('first_name', '') + " " + result_dict.get('father_name', '') + " " + result_dict.get('last_name','')
287
- name_en = result_dict.get('first_name_en', '') + " " + result_dict.get('father_name_en', '')+ " " + result_dict.get("last_name_en",'')
288
-
289
- missing = [key for key, value in required_fields.items() if not str(value).strip()]
290
- if missing:
291
- return {'error': 'covered_photo', 'error_details': f'Missing or empty fields: {", ".join(missing)}'}
292
-
293
- result = {
294
- "error": "",
295
- "error_details": "",
296
- **locals()
297
- }
298
-
299
- if 'required_fields' in result.keys():
300
- del result['required_fields']
301
- if 'missing' in result.keys():
302
- del result['missing']
303
- if 'optional_fields' in result.keys():
304
- del result['optional_fields']
305
- if 'result_dict' in result.keys():
306
- del result['result_dict']
307
- return result
308
-
309
- except Exception as e:
310
- return {'error':'covered_photo','error_details':e}
311
-
312
-
313
-
327
+ gender_ar: str = Field(..., description="Gender (الجنس / ردگار): ذكر (Male) or أنثى (Female).")
328
+ gender: str = Field(..., description="Gender (الجنس / ردگار), translated to English male or female ")
329
+ id_number: str = Field(...,min_length=12, max_length=12, description="12-digit national ID number.Must be exactly 12 digits.")
330
+ card_number: str = Field(...,min_length=9, max_length=9, description="9-character alphanumeric document number.Must be exactly 9 characters.")
331
+ serial_number: str = Field("", min_length = 6, max_length=6, description="6-digit card serial number present vertical, its the last thing present in ocr text extracted.")
332
+ blood_type: str = Field(None, description="Blood type (e.g., O+, A-).")
333
+ header_verified: bool = Field(..., description="whether document is a valid Iraqi National ID's front side."
334
+ "It should strictly contain at least one of the following Arabic/Kurdish texts:"
335
+ " جمهورية العراق / وزارة الداخلية"
336
+ "مديرية الأحوال المدنية والجوازات والاقامة"
337
+ "کوماری عیراق / وه زاره تی ناوخو"
338
+ "پریود به را بائی باری شارستانی و پاسپورت و نیشنگه"
339
+ "جمهورية العراق / وزارة الداخلية"
340
+ "کوماری عیراق / وه زاره تی ناوخو")
341
+
342
+
314
343
  class Iraq_National_ID_back(BaseModel):
315
344
  """Extract only the Arabic fields from the OCR text of an Iraqi National ID's back side. A back side has fields like dates: issue, expiry, birth. Translate where required."""
345
+ ocr_text: str = Field(..., description="Full OCR extracted text from the Iraqi National ID back side image.")
316
346
  issuing_authority: str = Field(..., description="Issuing authority (جهة الاصدار / لايانى ددرجوون) in Arabic")
317
347
  issuing_authority_en: str = Field(..., description="Issuing authority (جهة الاصدار / لايانى ددرجوون), translated to English")
318
348
  issue_date: str = Field(..., description="Date of issue")
@@ -320,644 +350,95 @@ class Iraq_National_ID_back(BaseModel):
320
350
  place_of_birth: str = Field(..., description="Place of birth in Arabic.")
321
351
  place_of_birth_en: str = Field(..., description="Place of birth, translated to English.")
322
352
  dob: str = Field(..., description="Date of birth")
323
- family_number: str = Field(..., description='18-character alphanumeric Family number (الرقم العائلي / ژمارەى خێزانی)')
353
+ family_number: str = Field(..., min_length=18, max_length=18, description='18-character alphanumeric Family number (الرقم العائلي / ژمارەى خێزانی)')
354
+ family_number_en: str = Field(..., min_length=18, max_length=18, description='18-character alphanumeric Family number same as family number (الرقم العائلي / ژمارەى خێزانی)')
324
355
  mrz1: str = Field(...,description="MRZ Line 1: Includes document type (ID), issuing country code (IRQ), document number, and check digits. Example: 'IDIRQAL36266736200026108063<<<'")
325
356
  mrz2: str = Field(...,description="MRZ Line 2: Encodes date of birth (YYMMDD), gender (M/F), expiry date (YYMMDD), and nationality code (IRQ) and check digit at the end of '<<<<<<'. Example: '0007191M2811280IRQ<<<<<<<<<<<7'")
326
357
  mrz3: str = Field(...,description="MRZ Line 3: Contains surname and given name(s), separated by '<<'. Given names may include multiple parts separated by '<'. If no surname is present, it starts with '<<'. Example: 'AHMED<<ALI<HASSAN' or '<<ALI'")
358
+ gender_mrz: str = Field(...,description="Gender extracted from MRZ line 2: 'M' for Male, 'F' for female.")
359
+ expiry_date_mrz: str = Field(...,description="Expiry date extracted from MRZ line 2 in DD/MM/YYYY format.")
360
+ dob_mrz: str = Field(...,description="Date of birth as extracted from MRZ (in DD/MM/YYYY format) first six characters of mrz2")
327
361
  last_name_back: str = Field(...,description="Surname extracted from MRZ line 3, before the '<<' separator.")
328
362
  first_name_back: str = Field(...,description="Given name extracted from MRZ line 3, after the '<<' seperator.")
363
+ header_verified: bool = Field(..., description="if header contains in mrz1 'IDIRQ' then true else false")
364
+ card_number_mrz: str = Field(..., min_length = 9, max_length=9, description="Document number as extracted from MRZ line 1")
329
365
 
330
366
 
331
- @tool(args_schema=Iraq_National_ID_back)
332
- def sanity_check_irq_back(
333
- issuing_authority='',
334
- issuing_authority_en='',
335
- issue_date='',
336
- expiry_date='',
337
- place_of_birth='',
338
- place_of_birth_en='',
339
- dob='', mrz1='', mrz2='', mrz3='',
340
- last_name_back='',
341
- first_name_back='',
342
- family_number=''
343
- ):
344
- try:
345
- #===========Post-Processing==============
346
- print("SANITY CHECK IRQ BACK WAS CALLED")
347
- """Run sanity checks on the data extracted from Iraq national ID's back side."""
348
- doc_type = 'national_identity_card'
349
-
350
- family_number = sanity_utils.fix_family_number(family_number)
351
-
352
- family_number_en = family_number
353
-
354
- #At this point, verify_irq_id has been run, so we can safely say its an Iraqi ID.
355
- nationality='IRQ'
356
- issuing_country='IRQ'
357
-
358
- if mrz1:
359
- card_number = mrz1.strip()[5:14]
360
- card_number_back = mrz1.strip()[5:15]
361
- id_number = mrz1.strip()[15:27]
362
- mrz = [mrz1 + mrz2 + mrz3]
363
-
364
- else:
365
- return {'error':'covered_photo', 'error_details':'cropped_mrz'}
366
-
367
-
368
-
369
- #==============Sanity checks for blur detection and/or cropped image
370
- valid_expiry_issue = sanity_utils.is_expiry_issue_diff_valid(issue_date,expiry_date, 10)
371
- age_check = sanity_utils.is_age_18_above(dob)
372
- dob_match_mrz_dob = sanity_utils.is_mrz_dob_mrz_field_match(dob, mrz2)
373
367
 
374
- is_doc_expired = sanity_utils.is_expired_id(expiry_date)
375
368
 
376
- if is_doc_expired:
377
- return {"error":"expired_id", 'error_details':'expired ID'}
378
-
379
- if mrz2:
380
- gender_back = sanity_utils.find_gender_from_back(mrz2.strip())
381
- else:
382
- gender_back=''
369
+ def process_image_irq(side):
370
+ if side == "front":
371
+ prompt = PROMPT_FRONT_IRQ
372
+ model = Iraq_National_ID_front
383
373
 
374
+ elif side == "back":
375
+ prompt = PROMPT_BACK_IRQ
376
+ model = Iraq_National_ID_back
384
377
 
378
+ elif side == "page1":
379
+ prompt = PROMPT_PASSPORT_IRQ
380
+ model = IraqiPassport
381
+ else:
382
+ raise ValueError("Invalid document side specified. Use 'front', 'back', or 'passport'.")
385
383
 
386
- if not (all([valid_expiry_issue, age_check, dob_match_mrz_dob])):
387
- return {'error':'covered_photo', 'error_details':'blur or cropped or low-quality image'}
384
+ return model, prompt
388
385
 
389
386
 
390
- #Check required fields
391
- optional_fields = ('last_name_back','first_name_back')
392
- required_fields = {k: v for k, v in locals().items() if k not in optional_fields}
393
387
 
394
- missing = [key for key, value in required_fields.items() if not str(value).strip()]
395
- if missing:
396
- return {
397
- 'error': 'covered_photo',
398
- 'error_details': f'Missing or empty fields: {", ".join(missing)}'
399
- }
388
+ def get_openai_response_irq(prompt: str, model_type, image: str, genai_key):
389
+ # covert byte io to utf-8 string
390
+
391
+ for attempt in range(3):
400
392
  try:
401
- dob = sanity_utils.convert_dob_to_standard(dob)
402
- expiry_date = sanity_utils.convert_dob_to_standard(expiry_date)
393
+ client = OpenAI(api_key=genai_key)
394
+ response = client.responses.parse(
395
+ model="gpt-4.1-mini",
396
+ input=[
397
+ {
398
+ "role": "system",
399
+ "content": "You are an expert at extracting information from identity documents.",
400
+ },
401
+ {
402
+ "role": "user",
403
+ "content": [
404
+ {"type": "input_text", "text": prompt},
405
+ {
406
+ "type": "input_image",
407
+ "image_url": f"data:image/jpeg;base64,{image}",
408
+ "detail": "high",
409
+ },
410
+ ],
411
+ },
412
+ ],
413
+ text_format=model_type,
414
+ )
415
+ return response.output_parsed
403
416
  except Exception as e:
404
- return {
405
- 'error': 'covered_photo',
406
- 'error_details': f'Exception Thrown while parsing dates: {e}'
407
- }
408
-
409
-
410
- result = {
411
- "error": "",
412
- "error_details": "",
413
- **locals()
414
- }
415
-
416
- if 'required_fields' in result.keys():
417
- del result['required_fields']
418
- if 'missing' in result.keys():
419
- del result['missing']
420
- if 'optional_fields' in result.keys():
421
- del result['optional_fields']
422
- return result
423
- except Exception as e:
424
- return {'error':'covered_photo','error_details':e}
417
+ logging.info(f"[ERROR] Attempt {attempt + 1} failed: {str(e)}")
418
+ time.sleep(2)
419
+ return None
425
420
 
426
421
 
427
422
 
428
- def route(result):
429
- if isinstance(result, AgentFinish):
430
- return {'error': 'covered_photo', 'error_details': result.return_values['output']}
431
- else:
432
- tools = {
433
- "sanity_check_irq_back": sanity_check_irq_back,
434
- "sanity_check_irq_front": sanity_check_irq_front,
435
- "sanity_check_irq_passport": sanity_check_irq_passport
436
- }
437
- return tools[result.tool].run(result.tool_input)
438
-
439
- def route_verification(result):
440
- if isinstance(result,AgentFinish):
441
- return ''
442
- else:
443
- return result.tool_input
444
-
445
- def extraction_chain(ocr_text, openai_key, side = ''):
423
+ def get_response_from_openai_irq(image, side, openai_key):
424
+ logging.info(f"Processing Iraqi document side: {side}")
425
+ image_bytes = base64.b64encode(image).decode('utf-8')
426
+ try:
427
+ model, prompt = process_image_irq(side)
428
+ except ValueError as ve:
429
+ logging.error(f"Error: {ve}")
430
+ return {"error": str(ve)}
431
+ logging.info(f"Using prompt for side and prompt {side} selected.{prompt[:50]}...")
446
432
  try:
447
- gpt_model = 'gpt-4o'
448
- print("WE ARE IN EXTRACTION CHAIN")
449
- tools_func = [sanity_check_irq_back, sanity_check_irq_front, sanity_check_irq_passport]
450
-
451
- model = ChatOpenAI(model=gpt_model, temperature=0,
452
- openai_api_key=openai_key)
453
- extraction_functions = [format_tool_to_openai_function(f) for f in tools_func]
454
- extraction_model = model.bind(functions=extraction_functions)
455
-
456
- prompt = ChatPromptTemplate.from_messages([
457
- ("system",
458
- "Extract the relevant information, if not explicitly provided do not guess, leave empty string. Extract partial info. Translate values wherever it is required."
459
- ),
460
- ("user", "{ocr_text}")
461
- ])
462
-
463
- prompt_verify_doc = ChatPromptTemplate.from_messages([
464
- ("system", "Verify the relevant document."
465
- ),
466
- ("user", "{ocr_text}")
467
- ])
468
-
469
- model_verification = ChatOpenAI(model=gpt_model, temperature=0,
470
- openai_api_key=openai_key)
471
- verification_function = [convert_pydantic_to_openai_function(Verify_IRQ_ID), convert_pydantic_to_openai_function(Verify_IRQ_Passport)]
472
- verification_model = model_verification.bind(functions=verification_function)
473
- verification_chain = prompt_verify_doc | verification_model | OpenAIFunctionsAgentOutputParser() | route_verification
474
- st = time.time()
475
- verification_model_result = verification_chain.invoke({"ocr_text":ocr_text})
476
- logging.info(f'----------------Time taken for Verification Chain: {time.time() - st} seconds\n')
477
- if verification_model_result == '':
478
- if side=='front':
479
- return {'error':f'not_front_id'}, ''
480
- if side=='back':
481
- return {'error':f'not_back_id'}, ''
482
- if side=='page1':
483
- return {'error': f'not_passport'}, ''
484
- else:
485
- return {'error':'covered_photo'}
486
- else:
487
- is_valid_id = verification_model_result.get("is_valid_id","")
488
-
489
-
490
- if verification_model_result.get("side","")=='passport':
491
- side_predicted='page1'
492
-
493
- else:
494
- side_predicted = verification_model_result.get("side","")
495
- print("Side Predicted:", side_predicted)
496
-
497
-
498
-
499
- if is_valid_id=="True" and side==side_predicted:
500
- max_retries = 2
501
- st = time.time()
502
- for attempt in range(max_retries+1):
503
- extraction_chain = prompt | extraction_model | OpenAIFunctionsAgentOutputParser() | route
504
- data = extraction_chain.invoke({"ocr_text": ocr_text})
505
-
506
- if data.get('error')=='':
507
- return data, side_predicted
508
- if data.get('error')!='' and attempt>=max_retries:
509
- return data, side_predicted
510
- elif data.get('error')!='' and attempt<max_retries:
511
- print("RETRYING")
512
- time.sleep(2)
513
- continue
514
- #Only for testing purpose, comment out when pushing to production.
515
- # if is_valid_id=="True" and side=='auto':
516
- # max_retries = 2
517
- # for attempt in range(max_retries+1):
518
- # extraction_chain = prompt | extraction_model | OpenAIFunctionsAgentOutputParser() | route
519
- # data = extraction_chain.invoke({"ocr_text": ocr_text})
520
-
521
- if data.get('error')=='':
522
- return data, side_predicted
523
- if data.get('error')!='' and attempt>=max_retries:
524
- return data, side_predicted
525
- elif data.get('error')!='' and attempt<max_retries:
526
- print("RETRYING")
527
- time.sleep(2)
528
- continue
529
- logging.info(f'----------------Time taken for Extraction Chain: {time.time() - st} seconds\n')
530
-
531
- else:
532
- if side=='' or side=='auto':
533
- side = side_predicted
534
- error = f"not_{side}_id"
535
- return {'error':error}, side
536
- if side=='front' or side=='back':
537
- return {'error':f'not_{side}_id'}, side
538
- elif side=='page1':
539
- return {'error':'not_passport'}, side
540
-
541
- except ValidationError as e:
542
- errors = e.errors() # list of error dicts
543
- # Extract all messages
544
- error = [error['msg'] for error in errors]
545
- return {'error':error[0], 'error_details': 'cropped mrz'},''
433
+ start_time = time.time()
434
+ response = get_openai_response_irq(prompt, model,image_bytes, openai_key)
435
+ elapsed_time = time.time() - start_time
436
+ logging.info(f"OpenAI extraction took {elapsed_time:.2f} seconds")
546
437
  except Exception as e:
547
- return {'error':'bad_image', 'error_details':e}, ''
548
-
549
- # from idvpackage.llm_ocr import llm_ocr_extraction
550
-
551
- # def ocr_and_extraction(base_64_image, openai_key, side):
552
- # openai.api_key = openai_key
553
- # ocr_text = llm_ocr_extraction(base_64_image)
554
- # result,side = extraction_chain(ocr_text, openai_key,side)
555
- # return ocr_text,result,side
556
-
557
-
558
- # use response.pareser method to get the side
559
-
560
-
561
- # PROMPT_IDENTIFY_IRQ_SIDE = """You are given OCR text extracted from an identity document. Produce a single JSON object that matches this Pydantic model exactly:
562
-
563
- # IdentifySideResponse:
564
- # - is_valid_id: "True" or "" (empty string)
565
- # - side: "front", "back", or "" (empty string)
566
-
567
- # Decision rules:
568
- # - Set is_valid_id = "True" if the OCR clearly belongs to an Iraqi National ID (contains Arabic/Kurdish phrases such as "جمهورية العراق", "وزارة الداخلية", "مديرية الأحوال المدنية", Kurdish equivalents, or clear ID structure like MRZ, DOB/issue/expiry dates, or a plausible Iraqi ID number). Otherwise set "".
569
- # - Determine side:
570
- # - "back" if OCR includes MRZ (three MRZ lines or MRZ-like patterns with '<'), or contains dates (DOB/issue/expiry) or MRZ-style date fields.
571
- # - "front" if OCR contains personal name fields, ID number, national symbols/text but no dates or MRZ.
572
- # - "" if you cannot confidently classify.
573
- # - Use exact string values ("True", "", "front", "back") and nothing else.
574
- # - Output only the JSON object (no explanation, no extra keys, no surrounding text).
575
-
576
- # Examples:
577
- # Input OCR:
578
- # "@@@\nI<IRQ<<<<DOE<<JOHN<<<<<<<<<<<<\n123456789IRQ\nDOB 01/02/1990\nEXP 01/02/2030\nوزارة الداخلية\n"
579
- # Output:
580
- # {"is_valid_id":"True","side":"back"}
581
-
582
- # Input OCR:
583
- # "جمهورية العراق\nالاسم: محمد احمد\nالرقم الوطني: 123456789\n"
584
- # Output:
585
- # {"is_valid_id":"True","side":"front"}
586
-
587
- # If uncertain about validity or side, prefer empty strings rather than guessing. Return only JSON object"""
438
+ logging.error(f"Error during OCR extraction: {e}")
439
+ return {"error": "OCR extraction failed."}
440
+ response_data = response.dict() if response else {}
441
+
442
+ logging.info(f"Openai response: {json.dumps(response_data, ensure_ascii=False, indent=2)}")
443
+ return response_data
588
444
 
589
- # PROMPT_FRONT_IRQ = """
590
- # You are an expert in reading Iraqi National ID Cards. Extract the following fields from the **front side** of the ID image.
591
- # OUTPUT FORMAT
592
- # - Return a single JSON object and nothing else.
593
- # - Use exactly these keys (string values) in this exact set: first_name, first_name_en, father_name, father_name_en, third_name, third_name_en, last_name, last_name_en, mother_first_name, mother_first_name_en, mother_last_name, mother_last_name_en, gender_ar, gender, id_number, card_number, serial_number, blood_type.
594
- # - For any field you cannot read or that is not present, return an empty string "".
595
- # - Do NOT include extra keys, comments, or explanatory text.
596
-
597
- # PREFERRED EXTRACTION ORDER (must follow this order when resolving ambiguous or multiple name-like values)
598
- # 1. name: first_name, first_name_en
599
- # 2. father's name: father_name, father_name_en
600
- # 3. paternal grandfather name / third name: third_name, third_name_en
601
- # 4. family/tribal name / last name: last_name, last_name_en
602
- # 5. mother's name (given / "bidah"): mother_first_name, mother_first_name_en
603
- # 6. mother's last name: mother_last_name, mother_last_name_en
604
- # 7. gender: gender_ar then gender
605
- # 8. blood type: blood_type
606
-
607
- # FIELD EXTRACTION RULES (high-precision)
608
- # 1. General:
609
- # - Prefer the text that is printed directly under or next to the label on the FRONT side. If multiple languages appear, store the Arabic exact text in *_ar fields and the English/transliterated text in *_en fields.
610
- # - When multiple candidate name-like values exist, choose following the PREFERRED EXTRACTION ORDER above. Do NOT swap order or assign the paternal-grandfather value to the father's slot, etc.
611
- # - Preserve characters exactly as printed for Arabic fields; do not normalize or transliterate Arabic into Latin unless placed into a *_en field.
612
- # - Do NOT hallucinate, infer, or guess missing values. If unclear, return "".
613
-
614
- # 2. Names:
615
- # - first_name (Arabic): the given name exactly as printed in Arabic on the front.
616
- # - first_name_en: the same given name transliterated to English (Latin script) exactly as printed or transliterated from Arabic; preserve casing and spaces.
617
- # - father_name / father_name_en, third_name / third_name_en follow same rules for father and paternal-grandfather.
618
- # - last_name / last_name_en: family/tribal name if present. If not present, return "" for both.
619
- # - mother_first_name / mother_first_name_en and mother_last_name / mother_last_name_en: extract mother's given and last names similarly.
620
-
621
- # 3. Gender:
622
- # - gender_ar: return the Arabic text exactly as printed (e.g., "ذكر" or "أنثى").
623
- # - gender: map to English "male" or "female" (lowercase). If ambiguous, return "".
624
-
625
- # 4. Identification numbers:
626
- # - id_number: must be exactly the 12 digits printed on the card (do not alter digits, do not insert spaces or separators). If not exactly 12 digits, return "".
627
- # - card_number: exact 9-character document number as printed (preserve letters/digits).
628
- # - serial_number: optional 6-character serial if present; else "".
629
- # - blood_type: optional (e.g., "O+", "A-"); return exactly as printed or "".
630
-
631
- # 5. Formatting & validation:
632
- # - Trim surrounding whitespace but do not change internal spacing, punctuation, or letter case for name fields.
633
- # - If both Arabic and English appear for a name under the same label, assign Arabic text to the *_ar field and English/transliteration to the *_en field.
634
-
635
- # 4. Do NOT guess or hallucinate any values. If unclear, return empty string.
636
-
637
- # 5. Return structured JSON output as per schema only.
638
- # """
639
-
640
- # PROMPT_BACK_IRQ = """
641
- # You are an expert in reading Iraqi National ID Cards. Extract the following fields from the **back side** of the ID image.
642
-
643
- # 1. **Extract MRZ lines (Machine Readable Zone):**
644
- # - Each line must be exactly 30 characters.
645
- # - Return as a list of exactly 3 strings (`mrz`), in order.
646
- # - Keep each line exactly as printed (no padding, no fixing).
647
- # - Remove all whitespace and punctuation.
648
- # - Return exact number of '<' characters in each line of mrz.
649
-
650
- # 2. **Verify IDIRQ prefix:**
651
- # - If the first line of MRZ starts with 'IDIRQ', return `idirq_verified` as true. Otherwise, false.
652
-
653
-
654
- # 3. **Extract and format these fields:**
655
- # - `dob_back`, `issue_date`, `expiry_date` in **DD/MM/YYYY** format.
656
-
657
- # 4. **Extract issuing authority:**
658
- # - `issuing_authority_ar`: Issuing authority (جهة الاصدار / لايانى ددرجوون) in Arabic, exactly as printed.
659
- # - `'issuing_authority_en'`: TRANSLATED name of the issuing authority (`issuing_authority_ar`) in English.
660
-
661
- # 5. **Extract place of birth:**
662
- # - `place_of_birth_ar`: Place of birth in Arabic as printed on the back
663
- # - `place_of_birth_en`: Transliterated place of birth (`place_of_birth_ar) into English
664
-
665
- # 6. **Extract Names**
666
- # - `first_name_back`: First name extracted from MRZ line 3, after the '<<' seperator."
667
- # - `last_name_back`: Surname extracted from MRZ line 3, before the '<<' separator." If this is not present, return null.
668
-
669
- # 7. **Extract Family Number:**
670
- # - `family_number`: 18-character alphanumeric Family number (الرقم العائلي / ژمارەى خێزانی)' exactly as printed (do not alter).
671
-
672
- # 8. **Extract Nationality:**
673
- # - `nationality`: 3-letter ISO nationality code, (e.g. IRQ for Iraq).
674
-
675
- # 8. **DO NOT GUESS.**
676
- # - If a field is faint, blurry, or unclear, return empty string.
677
-
678
- # 9. Return output as JSON according to the defined schema.
679
- # """
680
-
681
-
682
- # PROMPT_PASSPORT_IRQ = """
683
- # Extract ALL fields from this Iraqi Passport image with high accuracy.
684
-
685
- # 1. Extract name English:
686
- # - `full_name`: Full Name, in English, exactly as printed
687
- # - Do not add anything from the field 'Surname' into this.
688
- # - `last_name`: Surname, in English, exactly as printed
689
- # - `mother_name`: Mother's full name in English, exactly as printed
690
-
691
- # 2. **Extract place of birth:**
692
- # - If value of Place of Birth is in English, return exactly as printed.
693
- # - else, if it is not in English, look at the right-hand side of the passport, where it says "Place of Birth"
694
- # - Transliterate this place of birth into English, if it is only in Arabic.
695
- # 3. Parse and extract:
696
- # - `issuing_authority`, exactly as printed in English.
697
- # - Transliterate `issuing_authority` to English if it is only in Arabic.
698
- # - `issuing_country`: country of issuance or country code, exactly as printed in English.
699
- # - `gender`: Gender/Sex either as Male or Female
700
- # - `dob`, `issue_date`, `expiry_date` → all in DD/MM/YYYY format
701
- # - `id_number`: must be 9-character alphanumeric passport number.
702
- # - `nationality`: use 3-letter ISO format (e.g., IRQ for Iraq, JOR for Jordan)
703
-
704
- # 4. If only two locations are visible, assign the first to place_of_birth and second to issuing_authority.
705
-
706
- # 5. Ensure that the fields `mrz1` and `mrz2` strictly follow the below format for passports:
707
-
708
- # - Both `mrz1` and `mrz2` must be exactly 44 characters long.
709
- # - Use the `<` symbol for padding, **not spaces or any other characters**.
710
- # - There should be **no commas, no spaces**, and only uppercase English alphabets, digits, and `<` characters are allowed.
711
- # - If the line is shorter than 44 characters, pad it **only with `<` symbols at the end**, **except**:
712
- # - In `mrz2`, the final character is a **check digit** (usually numeric) and must remain the last character. Padding with `<` should be applied **before** this digit.
713
- # - Do not introduce extra characters to make the string 44 characters. Do not insert `<` between letters or numbers — only at the end (or just before the check digit in `mrz2`).
714
- # - Do not append any punctuation like commas, periods, or symbols.
715
-
716
- # Return the lines exactly as shown, with **no trailing whitespace** or formatting.
717
-
718
- # 5. Do not guess or invent any value. If a field is unclear or missing, return empty string.
719
-
720
- # 6. Output MUST be a structured JSON following the defined schema.
721
- # """
722
-
723
- # class IraqiIDCardFront(BaseModel):
724
- # first_name: str = Field(..., description="First name (الاسم / ناو) in Arabic.")
725
- # first_name_en: str = Field(..., description="Transliterate First name (الاسم / ناو), to English.")
726
- # father_name: str = Field(..., description="Father's name (الأب / باوك) in Arabic.")
727
- # father_name_en: str = Field(..., description="Transliterate Father's name (الأب / باوك) to English.")
728
- # third_name: str = Field(..., description="Paternal grandfather's name (الجد / بابير) in Arabic.")
729
- # third_name_en: str = Field(..., description="Transliterate Paternal grandfather's name (الجد / بابير) to English.")
730
- # last_name: Optional[str] = Field(
731
- # "",
732
- # description=(
733
- # "Family/tribal name (اللقب / نازناو) in Arabic. "
734
- # "OCR extracts various versions of 'نازناو' like الزناو, الزنار; do not interpret them as the family name."
735
- # )
736
- # )
737
- # last_name_en: Optional[str] = Field(
738
- # "",
739
- # description=(
740
- # "Transliterate Family/tribal name (اللقب / نازناو) to English. "
741
- # "OCR extracts various versions of 'نازناو' like الزناو, الزنار; do not interpret them as the family name."
742
- # )
743
- # )
744
- # mother_first_name: str = Field(..., description="Mother's name (الام/ دابك) in Arabic.")
745
- # mother_first_name_en: str = Field(..., description="Transliterate Mother's name (الام/ دابك) to English.")
746
- # mother_last_name: str = Field(..., description="Maternal grandfather's name (الجد / بابير) in Arabic.")
747
- # mother_last_name_en: str = Field(...,
748
- # description="Transliterate Maternal grandfather's name (الجد / بابير) to English.")
749
- # gender_ar: str = Field(..., description="Gender (الجنس / ردگار): ذكر (male) or أنثى (female).")
750
- # gender: str = Field(..., description="Translate Gender (الجنس / ردگار) to English")
751
- # id_number: str = Field(..., description="12-digit national ID number.")
752
- # card_number: str = Field(..., description="9-character alphanumeric document number.")
753
- # serial_number: Optional[str] = Field("", description="6-digit card serial number.")
754
- # blood_type: Optional[str] = Field(None, description="Blood type (e.g., O+, A-).")
755
-
756
-
757
-
758
- # class IraqiIDCardBack(BaseModel):
759
- # issuing_authority_ar: str = Field(..., description="Issuing authority (جهة الاصدار / لايانى ددرجوون) in Arabic")
760
- # issuing_authority_en: str = Field(..., description="TRANSLATE Issuing authority into English")
761
- # issue_date: str = Field(..., description="Issue date in DD/MM/YYYY format")
762
- # expiry_date: str = Field(..., description="Expiry date in DD/MM/YYYY format")
763
- # place_of_birth_ar: str = Field(..., description="Place of birth in Arabic.")
764
- # place_of_birth_en: str = Field(..., description="Transliterated Place of birth into English.")
765
- # dob: str = Field(..., description="Date of birth in DD/MM/YYYY format")
766
- # family_number: str = Field(...,
767
- # description='18-character alphanumeric Family number (الرقم العائلي / ژمارەى خێزانی) exactly as printed (do not alter).')
768
- # mrz: List[str] = Field(..., min_items=3, max_items=3,
769
- # description="List of 3 MRZ lines. Each line must be exactly as printed on the ID (30 characters, unaltered).")
770
- # first_name_back: str = Field(..., description="Given name extracted from MRZ line 3, after the '<<' seperator.")
771
- # last_name_back: Optional[str] = Field(...,
772
- # description="Surname extracted from MRZ line 3, before the '<<' separator. If this is not present, return null.")
773
- # idirq_verified: bool = Field(..., description="True if the first MRZ line starts with 'IDIRQ'")
774
- # nationality: str = Field(..., description="3-letter nationality code (e.g., IRQ for Iraq)")
775
-
776
- # @validator("idirq_verified", always=True)
777
- # def check_idirq(cls, v, values):
778
- # mrz = values.get("mrz", [])
779
- # return bool(mrz and mrz[0].startswith("IDIRQ"))
780
-
781
-
782
- # class IraqiPassport(BaseModel):
783
- # full_name: str = Field(..., description="The Full Name, in English, exactly as printed on the document")
784
- # last_name: str = Field(..., description="Surname of the person on the passport")
785
- # place_of_birth: str = Field(..., description=("If Place of Birth is in English, return exactly as printed."
786
- # "If not present in English, look at the right-hand side of the passport, where it says 'Place of Birth'."
787
- # "Transliterate to English if value of Place of Birth is only in Arabic"))
788
-
789
- # issuing_authority: str = Field(..., description=("Place of passport issuance in English"
790
- # "Transliterate to English if issuing authority is only in Arabic"))
791
- # issuing_country: str = Field(..., description="Issuing Country/Country Code (e.g. 'IRQ', 'JOR')", example='IRQ')
792
- # mother_name: str = Field(..., description="Mother's full name in English, exactly as printed.")
793
- # gender: str = Field(..., description="printed as Sex: M or F return 'Male' or 'Female' accordingly")
794
- # mrz1: str = Field(..., min_length=44, max_length=44,
795
- # description="First line of the MRZ, exactly 44 characters, padded with '<' at the end if shorter")
796
- # mrz2: str = Field(..., min_length=44, max_length=44,
797
- # description="Second line of the MRZ, exactly 44 characters. Padding with '<' must be inserted before the final check digit.")
798
- # id_number: str = Field(..., pattern=r"^[A-Z][0-9]{8}$",
799
- # description="Passport number: one uppercase letter followed by 8 digits")
800
-
801
- # dob: str = Field(
802
- # ..., description="Date of birth in DD/MM/YYYY format"
803
- # )
804
- # issue_date: str = Field(
805
- # ..., description="Issue date in DD/MM/YYYY format"
806
- # )
807
- # expiry_date: str = Field(
808
- # ..., description="Expiry date in DD/MM/YYYY format"
809
- # )
810
- # nationality: str = Field(
811
- # ..., description="Nationality in ISO 3166-1 alpha-3 format (e.g., SDN)"
812
- # )
813
-
814
- # header_verified: bool = Field(
815
- # ..., description="True if document header ('IRQ', 'Republic of Iraq') is detected"
816
- # )
817
-
818
- # class IdentifyIRQSideResponse(BaseModel):
819
- # is_valid_id: bool = Field(..., description="Return True if document is either a valid Iraqi National ID's front side or back side."
820
- # "It should contain Arabic/Kurdish text like: جمهورية العراق / وزارة الداخلية"
821
- # "مديرية الأحوال المدنية والجوازات والاقامة"
822
- # "کوماری عیراق / وه زاره تی ناوخو"
823
- # "پریود به را بائی باری شارستانی و پاسپورت و نیشنگه"
824
- # "جمهورية العراق / وزارة الداخلية"
825
- # "کوماری عیراق / وه زاره تی ناوخو"
826
- # "Return empty string '' otherwise.")
827
- # # side should be one of the front, back or empty string
828
- # side: str = Field(..., description="Determine if this is a front side or back side of an Iraqi National ID. Return empty string if its neither."
829
- # "A back side has three lines of MRZ, has dates of birth, issue and expiry"
830
- # "A front side has names, and id number. No dates. return front or back accordingly.")
831
-
832
-
833
-
834
- # def _image_to_jpeg_bytesio(image) -> BytesIO:
835
- # """
836
- # Accepts: numpy.ndarray (OpenCV BGR), PIL.Image.Image, bytes/bytearray, or io.BytesIO
837
- # Returns: io.BytesIO containing JPEG bytes (ready for get_openai_response)
838
- # """
839
- # import numpy as np
840
-
841
- # if isinstance(image, BytesIO):
842
- # image.seek(0)
843
- # return image
844
-
845
- # if isinstance(image, (bytes, bytearray)):
846
- # return BytesIO(image)
847
-
848
- # try:
849
- # from PIL.Image import Image as _PILImage
850
-
851
- # if isinstance(image, _PILImage):
852
- # buf = BytesIO()
853
- # image.convert("RGB").save(buf, format="JPEG", quality=95)
854
- # buf.seek(0)
855
- # return buf
856
- # except Exception:
857
- # pass
858
-
859
- # if isinstance(image, np.ndarray):
860
- # success, enc = cv2.imencode(".jpg", image)
861
- # if not success:
862
- # raise ValueError("cv2.imencode failed")
863
- # return BytesIO(enc.tobytes())
864
-
865
- # raise TypeError(
866
- # "Unsupported image type. Provide numpy.ndarray, PIL.Image.Image, bytes, or io.BytesIO."
867
- # )
868
-
869
- # def get_irq_side_from_openai(image, openai_key):
870
-
871
- # logging.info(f"Getting side of Iraqi ID from OpenAI... and type of image {type(image)}")
872
- # base_64_image = _image_to_jpeg_bytesio(image)
873
- # b64_image = base64.b64encode(base_64_image.getvalue()).decode("utf-8")
874
-
875
- # logging.info(f"Converted image to JPEG BytesIO for OpenAI processing. type of base_64_image {type(b64_image)}")
876
- # for attempt in range(3):
877
- # try:
878
- # client = OpenAI(api_key=openai_key)
879
- # # image_data = base64.b64decode(b64_image)
880
- # response = client.responses.parse(
881
- # model="gpt-4.1-mini",
882
- # input = [{"role": "system", "content": "You are an expert at extracting information from identity documents, extract data as per fields, dont use any additional text or infer from mrz data."},
883
- # {"role": "user", "content": [
884
- # {"type": "input_text", "text": PROMPT_IDENTIFY_IRQ_SIDE},
885
- # {"type": "input_image", "image_url": f"data:image/jpeg;base64,{b64_image}", "detail": "low"},
886
- # ]},
887
- # ],
888
- # text_format = IdentifyIRQSideResponse
889
- # )
890
- # logging.info(f"Received response from OpenAI for side identification., {response.output_parsed}")
891
-
892
- # return vars(response.output_parsed)
893
-
894
- # except Exception as e:
895
- # logging.error(f"Error in get_side_from_openAI attempt {attempt + 1}: {e}")
896
- # time.sleep(2)
897
- # return {"is_valid_id": "", "side": ""}, b64_image
898
-
899
- # def get_openai_response_irq(prompt: str, model_type, image: BytesIO, genai_key):
900
-
901
- # for attempt in range(3):
902
- # try:
903
- # client = OpenAI(api_key=genai_key)
904
- # response = client.responses.parse(
905
- # model="gpt-4.1-mini",
906
- # input=[
907
- # {"role": "system",
908
- # "content": "You are an expert at extracting information from identity documents."},
909
- # {"role": "user", "content": [
910
- # {"type": "input_text", "text": prompt},
911
- # {"type": "input_image", "image_url": f"data:image/jpeg;base64,{image}", "detail": "low"},
912
- # ]},
913
- # ],
914
- # text_format=model_type,
915
- # )
916
- # return response.output_parsed
917
- # except Exception as e:
918
- # logging.info(f"[ERROR] Attempt {attempt + 1} failed: {str(e)}")
919
- # time.sleep(2)
920
- # return None
921
-
922
- # def process_image_irq(side):
923
- # if side == "front":
924
- # prompt = PROMPT_FRONT_IRQ
925
- # model = IraqiIDCardFront
926
-
927
- # elif side == "back":
928
- # prompt = PROMPT_BACK_IRQ
929
- # model = IraqiIDCardBack
930
-
931
- # elif side == "passport":
932
- # prompt = PROMPT_PASSPORT_IRQ
933
- # model = IraqiPassport
934
- # else:
935
- # raise ValueError("Invalid document side specified. Use 'front', 'back', or 'passport'.")
936
-
937
- # return model, prompt
938
-
939
- # def get_response_from_openai_irq(image, side, openai_key):
940
- # logging.info(f"Getting response from OpenAI for Iraqi Id side {side}... and type of image {type(image)}")
941
- # try:
942
- # base_64_image = _image_to_jpeg_bytesio(image)
943
- # b64_image = base64.b64encode(base_64_image.getvalue()).decode("utf-8")
944
- # logging.info(f"Converted image to JPEG BytesIO for OpenAI processing. type of base_64_image {type(b64_image)}")
945
- # except Exception as e:
946
- # logging.error(f"Error converting image: {e}")
947
- # return {"error": "Image conversion failed"}
948
- # try:
949
- # model, prompt = process_image_irq(side)
950
- # logging.info(f"Using model: {model.__name__} and prompt {prompt[:100]}")
951
- # except ValueError as ve:
952
- # logging.error(f"Error: {ve}")
953
- # return {"error": str(ve)}
954
-
955
- # try:
956
- # response = get_openai_response_irq(prompt, model, b64_image, openai_key)
957
- # except Exception as e:
958
- # logging.error(f"Error during OpenAI request: {e}")
959
- # return {"error": "OpenAI request failed"}
960
-
961
- # response_data = vars(response)
962
- # logging.info(f"Openai response: {response}")
963
- # return response_data