idvpackage 3.0.11__py3-none-any.whl → 3.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- idvpackage/common.py +8 -966
- idvpackage/iraq_id_extraction_withopenai.py +374 -893
- idvpackage/jor_passport_extraction.py +1 -6
- idvpackage/liveness_spoofing_v2.py +2 -45
- idvpackage/ocr.py +1016 -2430
- idvpackage/ocr_utils.py +148 -489
- idvpackage/pse_passport_extraction.py +18 -292
- idvpackage/qatar_id_extraction.py +4 -956
- idvpackage/sudan_passport_extraction.py +0 -928
- idvpackage/syr_passport_extraction.py +27 -402
- idvpackage/uae_id_extraction.py +87 -151
- {idvpackage-3.0.11.dist-info → idvpackage-3.0.13.dist-info}/METADATA +1 -1
- idvpackage-3.0.13.dist-info/RECORD +34 -0
- {idvpackage-3.0.11.dist-info → idvpackage-3.0.13.dist-info}/WHEEL +1 -1
- idvpackage/ekyc.py +0 -78
- idvpackage/genai_utils.py +0 -309
- idvpackage/iraq_id_extraction.py +0 -992
- idvpackage/iraq_passport_extraction.py +0 -588
- idvpackage/lazy_imports.py +0 -44
- idvpackage/lebanon_passport_extraction.py +0 -161
- idvpackage/sau_id_extraction.py +0 -248
- idvpackage/sudan_id_extraction.py +0 -764
- idvpackage-3.0.11.dist-info/RECORD +0 -42
- {idvpackage-3.0.11.dist-info → idvpackage-3.0.13.dist-info}/licenses/LICENSE +0 -0
- {idvpackage-3.0.11.dist-info → idvpackage-3.0.13.dist-info}/top_level.txt +0 -0
|
@@ -1,161 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import time
|
|
3
|
-
import openai
|
|
4
|
-
import re
|
|
5
|
-
import logging
|
|
6
|
-
import json
|
|
7
|
-
import time
|
|
8
|
-
import datetime
|
|
9
|
-
import openai
|
|
10
|
-
from langchain.tools import tool
|
|
11
|
-
from langchain.tools.render import format_tool_to_openai_function
|
|
12
|
-
from langchain.prompts import ChatPromptTemplate
|
|
13
|
-
from langchain.chat_models import ChatOpenAI
|
|
14
|
-
from pydantic import BaseModel, Field, validator
|
|
15
|
-
from langchain.utils.openai_functions import convert_pydantic_to_openai_function
|
|
16
|
-
from typing import Optional, Literal
|
|
17
|
-
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
|
|
18
|
-
from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
|
|
19
|
-
import idvpackage.genai_utils as genai_utils
|
|
20
|
-
import idvpackage.genai_utils as sanity_utils
|
|
21
|
-
from idvpackage.genai_utils import convert_pydantic_to_openai_function2
|
|
22
|
-
from datetime import datetime, timedelta
|
|
23
|
-
from dateutil.relativedelta import relativedelta
|
|
24
|
-
from pydantic import ValidationError
|
|
25
|
-
import logging
|
|
26
|
-
from langchain.schema.agent import AgentFinish
|
|
27
|
-
|
|
28
|
-
import openai
|
|
29
|
-
import json
|
|
30
|
-
|
|
31
|
-
import openai
|
|
32
|
-
import json
|
|
33
|
-
|
|
34
|
-
class Verify_LBN_Passport(BaseModel):
    """Validates whether a given OCR text represents a valid Lebanese Passport"""
    # Pydantic schema exposed to the LLM as an OpenAI function (bound in
    # verify_lbn_pss_chain). The docstring above and the field description
    # below are sent to the model verbatim and steer its decision, so their
    # wording is behavioral — do not edit casually.
    # NOTE: the implicitly concatenated literals below have no separating
    # spaces between sentences; this is how the prompt currently ships.
    is_valid_id: Literal["True", "False", ""] = Field(..., description="Return True if document is a valid Lebanese Passport"
        "It should contain English Text: Republic of Lebanon"
        "It should contain things a passport typically has like two-line mrz, date of issue, expiry, name, etc."
        "Return False otherwise.")
|
|
40
|
-
|
|
41
|
-
@tool(args_schema=Verify_LBN_Passport)
def verify_lbn_passport(is_valid_id):
    # LangChain tool dispatched by route(); it simply echoes the model's
    # verdict ("True" / "False" / "") back to the caller. The decorator
    # derives the tool's registered name from this function name, so the
    # name must stay in sync with the registry in route().
    return is_valid_id
|
|
44
|
-
|
|
45
|
-
def route(result):
    """Dispatch one agent step.

    If `result` is a terminal AgentFinish, unwrap and return its output;
    otherwise look up the requested tool by `result.tool` and run it with
    `result.tool_input`.
    """
    if isinstance(result, AgentFinish):
        return result.return_values['output']
    registry = {
        "verify_lbn_passport": verify_lbn_passport,
    }
    return registry[result.tool].run(result.tool_input)
|
|
53
|
-
|
|
54
|
-
def verify_lbn_pss_chain(ocr_text, openai_key):
    """Ask GPT-4o, via a LangChain function-calling chain, whether the
    given OCR text is a Lebanese passport.

    Returns the model's verdict: "True", "False", or "" when absent.
    """
    print("WE ARE IN EXTRACTION CHAIN")

    # Prompt -> function-bound chat model -> JSON parser, piped as one chain.
    chain = (
        ChatPromptTemplate.from_messages([
            ("system", "You are an expert at identifying Lebanese Passports."),
            ("user", "{ocr_text}"),
        ])
        | ChatOpenAI(model='gpt-4o', temperature=0,
                     openai_api_key=openai_key).bind(
            functions=[convert_pydantic_to_openai_function(Verify_LBN_Passport)]
        )
        | JsonOutputFunctionsParser()
    )

    verdict = chain.invoke({"ocr_text": ocr_text})
    return verdict.get("is_valid_id", "")
|
|
73
|
-
|
|
74
|
-
def make_api_request_with_retries(prompt: str, max_retries: int = 3, delay_seconds: float = 2):
    """Call the OpenAI chat API and parse the reply as JSON, with retries.

    Sends `prompt` as a single user message to gpt-4o. The reply is parsed
    as JSON; if that fails, a ```json ...``` fenced block or a bare {...}
    object is extracted from the text and parsed. Single-quoted pseudo-JSON
    is handled with ast.literal_eval — replacing the original `eval`, which
    executed model-controlled text and was a code-injection risk.

    Args:
        prompt: the user message to send.
        max_retries: number of attempts before giving up.
        delay_seconds: pause between attempts.

    Returns:
        The parsed JSON value (normally a dict).

    Raises:
        Exception: when every attempt fails; wraps the last error.
    """
    import ast  # local import: only needed for the quoted-dict fallback

    for attempt in range(max_retries):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4o",
                temperature=0.4,
                max_tokens=2000,
                messages=[
                    {
                        "role": "user",
                        "content": prompt
                    }
                ]
            )
            result = response.choices[0].message.content
            logging.info(f"OpenAI successfully prompted.{result} ")

            try:
                return json.loads(result)
            except json.JSONDecodeError:
                # The model often wraps JSON in a fenced code block or
                # surrounds it with prose; pull out the payload.
                json_match = re.search(r'```(json|python|plaintext)?\s*(.*?)\s*```|\s*({.*?})', result, re.DOTALL)
                if json_match:
                    json_str = json_match.group(2) or json_match.group(3)
                    try:
                        return json.loads(json_str)
                    except (json.JSONDecodeError, TypeError):
                        # Single-quoted dict: parse as a Python literal
                        # (safe — no arbitrary code execution, unlike eval).
                        return ast.literal_eval(json_str)
                # Nothing extractable: re-raise the decode failure so the
                # outer handler logs it and retries.
                return json.loads(result)

        except Exception as e:
            print(f"Error during API request (attempt {attempt + 1} of {max_retries}): {str(e)}")
            if attempt < max_retries - 1:
                time.sleep(delay_seconds)
            else:
                raise Exception(f"Max retries exceeded. Last error: {str(e)}")
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
def lebanon_passport_extraction(passport_data):
    """Extract structured fields from Lebanese passport OCR text via GPT.

    Builds an extraction prompt around `passport_data`, asks the model for
    a dict (via make_api_request_with_retries), then normalizes the result:
    renames the registry and passport-number keys, concatenates the MRZ
    lines, defaults issuing country / nationality to 'LBN', and expands
    gender to FEMALE/MALE. On any failure a dict of empty fields is
    returned instead of raising.
    """
    try:
        prompt = f"From the attached text, please extract the data in a structured format, the response should be a dictionary, having first_name(only English), mother_name(mother english name if available, else empty string ''), father_name(only English), name which is last_name(only English), passport_number, dob(dd/mm/yyyy), place_of_birth, nationality(ISO 3166-1 alpha-3 country code), issue_date(dd/mm/yyyy), expiry_date(dd/mm/yyyy), gender(FEMALE, MALE), mrz1, mrz2, registry_place_and_issue_number(if not available then empty string ''). Note that the passport_number should always be 2 letters and 7 digits, if the length is less than 7 then append 0 in the start for passport_number_en and same way for passport_number_ar(numbers in passport arabic as well). Also note that the names should be extracted correctly, don't pick any random words for names, especially for first and last_name, it can be verified from the mrz1 string. So please make sure the names are correctly extracted. The structure of the response should be 'first_name', 'father_name', 'last_name', 'mother_name', 'id_number', 'dob', 'expiry_date', 'issue_date', 'place_of_birth', nationality, registry_place_and_issue_number, etc.. Make sure that the response should only contain a dictionary, and nothing else. Here's the text for your task: {passport_data}"

        back_data = make_api_request_with_retries(prompt)

        if back_data:
            # Rename model-facing keys to the package's canonical names.
            if back_data.get('registry_place_and_issue_number', ''):
                back_data['registry_place_and_number'] = back_data.pop('registry_place_and_issue_number', '')

            if back_data.get('passport_number', ''):
                back_data['id_number'] = back_data.pop('passport_number', '')

            # Combined MRZ is only set when both lines are present.
            if back_data.get('mrz1', '') and back_data.get('mrz2', ''):
                back_data['mrz'] = back_data.get('mrz1', '') + back_data.get('mrz2', '')

            back_data['issuing_country'] = 'LBN'

            if not back_data.get('nationality', ''):
                back_data['nationality'] = 'LBN'

            # Normalize gender in one pass. The original did this twice
            # (map F/M to full words, then strip+upper again); the net
            # effect — strip + uppercase, single letters expanded — is
            # preserved here without the redundancy.
            if 'gender' in back_data:
                gender = back_data['gender'].strip().upper()
                if gender == 'F':
                    gender = 'FEMALE'
                elif gender == 'M':
                    gender = 'MALE'
                back_data['gender'] = gender

    except Exception as e:
        print(f"Error in processing the extracted data: {e}")
        back_data = {
            'first_name': '',
            'last_name': '',
            'dob': '',
            'place_of_birth': '',
            'expiry_date': ''
        }

    return back_data
|
idvpackage/sau_id_extraction.py
DELETED
|
@@ -1,248 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
import re
|
|
3
|
-
from datetime import datetime
|
|
4
|
-
from hijri_converter import convert
|
|
5
|
-
# from googletrans import Translator
|
|
6
|
-
import pycountry
|
|
7
|
-
import os
|
|
8
|
-
|
|
9
|
-
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "streamlit-connection-b1a38b694505 (1).json"
|
|
10
|
-
|
|
11
|
-
def get_country_code(name_in_foreign_language):
    """Map an Arabic country name to its ISO 3166-1 alpha-3 code.

    Only the hard-coded names below are recognized; any other input yields
    None. (A translation + pycountry fallback existed historically but was
    disabled.)
    """
    # Manual mapping for names whose automatic translation was unreliable.
    known_codes = {
        'ميانمار': 'MMR',        # Myanmar
        'البحرين': 'BHR',        # Bahrain
        'اليمن الجنوبي': 'YEM',  # Yemen
        'فلسطين': 'PSE',         # Palestine
        'الفلبين': 'PHL',        # Philippines
        'أفغانستان': 'AFG',      # Afghanistan
        'سوريا': 'SYR',          # Syria
        'سري لنكا': 'LKA',       # Sri Lanka
    }
    return known_codes.get(name_in_foreign_language)
|
|
35
|
-
|
|
36
|
-
# Date pattern: YYYY/M/D in Western (0-9) or Eastern Arabic-Indic (۰-۹)
# digits. NOTE: extract_id_details defines a local `pattern` with the same
# value, shadowing this module-level one inside that function.
pattern = r'(\d{4}/\d{1,2}/\d{1,2}|[۰-۹]{4}/[۰-۹]{1,2}/[۰-۹]{1,2})'
|
|
37
|
-
def eastern_arabic_to_english(eastern_numeral):
    """Convert Arabic-Indic (٠-٩) and Extended Arabic-Indic (۰-۹) digits in
    a string to ASCII digits.

    Characters outside the digit set pass through unchanged, so mixed text
    like '۱۴۴۵/۰۵/۱۰' or 'abc/12' is handled safely.
    """
    digit_table = str.maketrans({
        '٠': '0', '۰': '0',
        '١': '1', '۱': '1',
        '٢': '2', '۲': '2',
        '٣': '3', '۳': '3',
        '٤': '4', '۴': '4',
        '٥': '5', '۵': '5',
        '٦': '6', '۶': '6',
        '٧': '7', '۷': '7',
        '٨': '8', '۸': '8',
        '٩': '9', '۹': '9',
        '/': '/',
    })
    # str.translate leaves unmapped characters untouched, matching the
    # original per-character join.
    return eastern_numeral.translate(digit_table)
|
|
56
|
-
|
|
57
|
-
def distinguish_dates(date_list):
    """Given candidate 'YYYY/MM/DD' strings, guess which is Gregorian and
    which is Hijri.

    Heuristic: the date closest to today is Gregorian, the farthest is
    Hijri (Hijri years run ~579 behind Gregorian, so they parse as the
    distant past). Returns (hijri_date, gregorian_date).
    """
    today = datetime.now().date()

    def days_from_today(date_str):
        # Absolute distance in days between the parsed date and today.
        return abs((today - datetime.strptime(date_str, '%Y/%m/%d').date()).days)

    ranked = sorted(date_list, key=days_from_today)
    return ranked[-1], ranked[0]
|
|
71
|
-
|
|
72
|
-
def hijri_to_gregorian(hijri_date):
    """Convert a 'YYYY/MM/DD' Hijri date string to Gregorian 'YYYY/MM/DD'.

    On any parsing/conversion failure the input is returned unchanged, so
    already-Gregorian or malformed values pass through as a best-effort
    fallback (callers rely on this — do not raise here).
    """
    try:
        year, month, day = map(int, hijri_date.split('/'))
        gregorian_date = convert.Hijri(year, month, day).to_gregorian()
        return f"{gregorian_date.year}/{gregorian_date.month:02}/{gregorian_date.day:02}"
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; all conversion errors fall through to passthrough.
        return hijri_date
|
|
84
|
-
|
|
85
|
-
def extract_dates(input_list):
    """Pull the first YYYY/MM/DD-style date out of each string.

    Matches Western digits or Eastern Arabic-Indic digits; positions with
    no match yield '' so the output aligns 1:1 with the input.
    """
    date_re = re.compile(
        r"(\d{4}/\d{2}/\d{2}|[۰۱۲۳۴۵۶۷۸۹]{4}/[۰۱۲۳۴۵۶۷۸۹]{2}/[۰۱۲۳۴۵۶۷۸۹]{2})"
    )
    results = []
    for text in input_list:
        found = date_re.search(text)
        results.append(found.group(0) if found else '')
    return results
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
def detect_script(word):
    """Classify a string as "Arabic", "English", "Mixed", or "Other".

    Bug fix vs. the original: it tested membership in range(0x41, 0x7A),
    which (a) excluded 'z' itself (range end is exclusive) and (b) counted
    the punctuation between 'Z' and 'a' ([ \\ ] ^ _ `) as English. We now
    test the actual ASCII letter ranges, and include U+06FF at the top of
    the Arabic block (the old range excluded it for the same reason).
    """
    has_arabic = any('\u0600' <= ch <= '\u06FF' for ch in word)
    has_english = any(('A' <= ch <= 'Z') or ('a' <= ch <= 'z') for ch in word)

    if has_arabic and has_english:
        return "Mixed"
    if has_arabic:
        return "Arabic"
    if has_english:
        return "English"
    return "Other"
|
|
116
|
-
|
|
117
|
-
def extract_english_strings(data):
    """Return the strings that contain no Arabic letters and no digits.

    NOTE: despite the name, digits also disqualify a string, so 'a1' is
    dropped along with Arabic text.
    """
    return [s for s in data if not re.search("[\u0600-\u06FF\d]", s)]
|
|
123
|
-
|
|
124
|
-
def extract_arabic_strings(data):
    """Return the strings that contain an Arabic letter or any digit.

    Exact complement of extract_english_strings: digits count as 'Arabic'
    here, so 'a1' is kept.
    """
    return [s for s in data if re.search(r"[\u0600-\u06FF\d]", s)]
|
|
130
|
-
|
|
131
|
-
def clean_special_chars(data):
    """Strip every character except ASCII letters and whitespace from each
    string, then trim leading/trailing whitespace."""
    letters_and_spaces = re.compile(r'[^A-Za-z\s]')
    return [letters_and_spaces.sub('', s).strip() for s in data]
|
|
137
|
-
|
|
138
|
-
def extract_id_details(result):
    """Extract Saudi national-ID fields from a list of OCR text lines.

    `result` is assumed to be an iterable of OCR lines (presumably from a
    Google Vision text-detection call — see the commented-out `detect_text`
    below; TODO confirm against the caller). Each field is located by
    searching the lines for its Arabic label and taking the adjacent text;
    every lookup is wrapped in a broad try/except so a missing or malformed
    field silently degrades to ''.

    Returns a dict with keys: 'id_number', 'nationality', 'gender' (always
    ''), 'dob' (reformatted to DD/MM/YYYY when found), 'expiry_date'
    (always ''), 'name' (English), 'occupation', 'employer'.
    """
    # result = detect_text(uploaded_id)
    # Single-row frame whose 'res' cell holds the whole list of OCR lines.
    df = pd.DataFrame({'res':[result]})
    # YYYY/M/D in Western or Eastern Arabic-Indic digits (shadows the
    # module-level `pattern`).
    pattern = r'(\d{4}/\d{1,2}/\d{1,2}|[۰-۹]{4}/[۰-۹]{1,2}/[۰-۹]{1,2})'
    i = 0
    df['Extracted_data']=''
    try:
        # Text adjacent to the label 'الجنسية' (nationality), mapped to an
        # ISO alpha-3 code via the manual table in get_country_code.
        nationality=[ele for ele in [ele for ele in df['res'].iloc[i] if 'الجنسية' in ele ][0].split('الجنسية') if ele!=''][0].strip()

        nationality=get_country_code(nationality)
    except:
        nationality=''
    try:
        ## employer: text around the label 'صاحب العمل'
        employer_ar=[ele for ele in [ele for ele in df['res'].iloc[i] if 'صاحب العمل' in ele ]][0]
        employer=[ele for ele in employer_ar.split('صاحب العمل') if ele!=''][0].strip()

    except:
        employer=''
    try:
        ### issuing place: text after the label 'مكان الإصدار'
        issuing_place_ar=[ele for ele in df['res'].iloc[i] if 'مكان الإصدار' in ele][0]
        issuing_place=issuing_place_ar.split('مكان الإصدار')[-1].strip()
    except:
        issuing_place=''
    try:
        # A line mentioning 'الإصدار' (issue) but not 'مكان' (place) should
        # carry both the issue date and DOB; grab both date tokens from it.
        comon_pattern=[ele for ele in [ele for ele in df['res'].iloc[i] if (('الإصدار' in ele ) and('مكان' not in ele))][0].split('الإصدار') if ele!=''][0].strip()
        matches = re.findall(pattern, comon_pattern)

        matches=[eastern_arabic_to_english(ele) for ele in matches]

        # NOTE(review): assumes the issue date appears before the DOB on
        # this line — confirm against real card layouts.
        issuing_date, dob=matches[0],matches[1]

        #issuing_date = hijri_to_gregorian(issuing_date)

    except:

        try:
            # Fallback 1: take the raw text next to the 'الميلاد' (birth)
            # and 'الإصدار' labels separately.
            dob=[ele for ele in [ele for ele in df['res'].iloc[i] if 'الميلاد' in ele ][0].split('الميلاد') if ele!=''][0].strip()
            issuing_date= [ele for ele in [ele for ele in df['res'].iloc[i] if( 'الإصدار' in ele) and ('مكان' not in ele ) ][0].split('الإصدار') if ele!=''][0].strip()
            #issuing_date=hijri_to_gregorian(issuing_date)
        except:
            try:
                # Fallback 2: both dates live on the same 'الميلاد' line;
                # strip the 'الانتهاء' (expiry) label characters off the
                # issue-date side.
                dob=[ele for ele in [ele for ele in df['res'].iloc[i] if 'الميلاد' in ele ][0].split('الميلاد') if ele!=''][-1].strip()
                issuing_date=[ele for ele in [ele for ele in df['res'].iloc[i] if 'الميلاد' in ele ][0].split('الميلاد') if ele!=''][0].strip('الانتهاء').strip()
                #issuing_date=hijri_to_gregorian(issuing_date)
            except:
                issuing_date,dob='',''

    try:

        #issuing_date_ar,dob_ar=re.findall(pattern, comon_pattern)

        ### Id Number: any line that is exactly 10 Western digits.
        id_number=[item for item in df['res'].iloc[i] if re.fullmatch(r'\d{10}', item)][0]
        id_number=eastern_arabic_to_english(id_number)

    except:

        try:
            # Fallback: 10-character token next to the label 'الرقم'
            # (number), after digit normalization.
            id_number = [ele for ele in eastern_arabic_to_english([ele for ele in [ele for ele in df['res'].iloc[i] if 'الرقم' in ele ][0].split('الرقم') if ele!=''][0].strip()).split(' ') if len(ele)==10][0]
            id_number=eastern_arabic_to_english(id_number)
        except:
            id_number=''

    try:
        # Occupation: text around the label 'المهنة'.
        profession_Ar=[ele for ele in [ele for ele in df['res'].iloc[i] if 'المهنة' in ele ]][0]

        profession=[ele for ele in profession_Ar.split('المهنة') if ele!=''][-1]

    except:
        profession=''
    try:
        # Name heuristics: the Arabic name is the line right after the one
        # containing 'وزارة' (ministry header); the English name is the
        # English line with the most spaces that is not a known header.
        Name_Index=[extract_arabic_strings(df['res'].iloc[i]).index(ele) for ele in extract_arabic_strings(df['res'].iloc[i]) if 'وزارة' in ele][0]
        Name_1=extract_arabic_strings(df['res'].iloc[i])[Name_Index+1]
        Name_length=len(Name_1.split(' '))
        Name_en=max([ele for ele in clean_special_chars(extract_english_strings(df['res'].iloc[i])) if ele not in ['KINGDOM OF SAUDI ARABIA','MINISTRY OF INTERIOR']], key=lambda x: x.count(' '))
        Name_ar=[ele for ele in [Name_1,Name_en] if ele!=Name_en][0]

    except:

        Name_en,Name_ar='',''

    # Fan the collected values out into one column per field.
    df['Extracted_data'].iloc[i]=[nationality,employer,issuing_date, dob,id_number,profession,Name_en,Name_ar]

    cols = ['nationality', 'employer', 'issuing_date', 'dob', 'id_number', 'profession', 'Name_en', 'Name_ar']

    for index, col_name in enumerate(cols):
        df[col_name] = df['Extracted_data'].apply(lambda x: x[index])

    # Normalize the date fields: isolate the date token, convert Eastern
    # Arabic digits, and convert the (presumed Hijri) issue date to
    # Gregorian.
    df['dob']=extract_dates(df['dob'].tolist())

    df['dob']=df['dob'].apply(lambda x: eastern_arabic_to_english(x))

    df['issuing_date']=df['issuing_date'].apply(lambda x: eastern_arabic_to_english(x))

    df['issuing_date']=df['issuing_date'].apply(lambda x: hijri_to_gregorian(x))

    dob = df['dob'].iloc[0]

    if dob:
        # Reformat YYYY/MM/DD -> DD/MM/YYYY.
        parsed_date = datetime.strptime(dob, "%Y/%m/%d")
        dob = parsed_date.strftime("%d/%m/%Y")

    # df['gender']= ''
    # df['expiry_data']= ' '

    # print(df)
    # TODO: gender, expiry_data
    return {'id_number': df['id_number'].iloc[0], 'nationality': df['nationality'].iloc[0], 'gender': '', 'dob': dob, 'expiry_date': '', 'name': df['Name_en'].iloc[0], 'occupation': df['profession'].iloc[0], 'employer': df['employer'].iloc[0]}
|
|
248
|
-
|