idvpackage 3.0.11__py3-none-any.whl → 3.0.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,161 +0,0 @@
1
- import json
2
- import time
3
- import openai
4
- import re
5
- import logging
6
- import json
7
- import time
8
- import datetime
9
- import openai
10
- from langchain.tools import tool
11
- from langchain.tools.render import format_tool_to_openai_function
12
- from langchain.prompts import ChatPromptTemplate
13
- from langchain.chat_models import ChatOpenAI
14
- from pydantic import BaseModel, Field, validator
15
- from langchain.utils.openai_functions import convert_pydantic_to_openai_function
16
- from typing import Optional, Literal
17
- from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
18
- from langchain.agents.output_parsers import OpenAIFunctionsAgentOutputParser
19
- import idvpackage.genai_utils as genai_utils
20
- import idvpackage.genai_utils as sanity_utils
21
- from idvpackage.genai_utils import convert_pydantic_to_openai_function2
22
- from datetime import datetime, timedelta
23
- from dateutil.relativedelta import relativedelta
24
- from pydantic import ValidationError
25
- import logging
26
- from langchain.schema.agent import AgentFinish
27
-
28
- import openai
29
- import json
30
-
31
- import openai
32
- import json
33
-
34
class Verify_LBN_Passport(BaseModel):
    """Validates whether a given OCR text represents a valid Lebanese Passport"""
    # NOTE(review): the docstring above is kept byte-identical on purpose; this
    # class is handed to convert_pydantic_to_openai_function, which presumably
    # exports the docstring as the function description - confirm before editing.

    # The model answers with the *strings* "True" / "False" (or "" when it gives
    # no answer), not with a Python bool.
    # The description sentences were previously concatenated without any
    # separator ("...Lebanese PassportIt should contain..."); each sentence is
    # now terminated so the instruction sent to the model reads correctly.
    is_valid_id: Literal["True", "False", ""] = Field(
        ...,
        description=(
            "Return True if document is a valid Lebanese Passport. "
            "It should contain English Text: Republic of Lebanon. "
            "It should contain things a passport typically has like two-line mrz, "
            "date of issue, expiry, name, etc. "
            "Return False otherwise."
        ),
    )
40
-
41
# Tool whose argument schema is Verify_LBN_Passport; it simply echoes back the
# `is_valid_id` value it is called with.
# NOTE(review): documented with `#` comments rather than a docstring because
# the langchain `@tool` decorator may use the function docstring as the tool
# description - confirm before adding one.
@tool(args_schema=Verify_LBN_Passport)
def verify_lbn_passport(is_valid_id):
    return is_valid_id
44
-
45
def route(result):
    """Resolve an agent step into a final value.

    If ``result`` is an ``AgentFinish``, its ``return_values['output']`` is
    returned. Otherwise ``result.tool`` is looked up in the local tool table
    and that tool is run with ``result.tool_input``.

    Raises:
        KeyError: if ``result.tool`` is not one of the registered tool names.
    """
    if not isinstance(result, AgentFinish):
        # Only one tool is registered for this module.
        dispatch = {"verify_lbn_passport": verify_lbn_passport}
        selected_tool = dispatch[result.tool]
        return selected_tool.run(result.tool_input)

    return result.return_values['output']
53
-
54
def verify_lbn_pss_chain(ocr_text, openai_key):
    """Ask the chat model whether ``ocr_text`` is a Lebanese passport.

    Builds a prompt -> model -> JSON-function-parser pipeline with the
    ``Verify_LBN_Passport`` schema bound as the only function, invokes it with
    the OCR text and returns the ``is_valid_id`` entry of the parsed result
    (``""`` when that key is absent).

    Args:
        ocr_text: text substituted into the ``{ocr_text}`` user message.
        openai_key: API key passed to ``ChatOpenAI``.
    """
    print("WE ARE IN EXTRACTION CHAIN")

    prompt_template = ChatPromptTemplate.from_messages([
        ("system", "You are an expert at identifying Lebanese Passports."),
        ("user", "{ocr_text}"),
    ])
    llm = ChatOpenAI(model='gpt-4o', temperature=0, openai_api_key=openai_key)
    function_schema = convert_pydantic_to_openai_function(Verify_LBN_Passport)

    pipeline = (
        prompt_template
        | llm.bind(functions=[function_schema])
        | JsonOutputFunctionsParser()
    )

    parsed = pipeline.invoke({"ocr_text": ocr_text})
    return parsed.get("is_valid_id", "")
73
-
74
def make_api_request_with_retries(prompt: str, max_retries: int = 3, delay_seconds: float = 2):
    """Send ``prompt`` to OpenAI and return the reply parsed as a Python object.

    The reply is parsed with ``json.loads`` first. If that fails, the first
    fenced code block (or, failing that, the first ``{...}`` span) is extracted
    and parsed as JSON, then as a Python literal with single quotes swapped
    for double quotes. Any failure (API error or unparseable reply) triggers a
    retry after ``delay_seconds``.

    Args:
        prompt: user message sent to the model.
        max_retries: number of attempts; with a value <= 0 no request is made
            and ``None`` is returned.
        delay_seconds: pause between failed attempts.

    Returns:
        The decoded reply (whatever JSON / literal value the model produced).

    Raises:
        Exception: "Max retries exceeded..." once the last attempt has failed;
            the underlying error is attached as ``__cause__``.
    """
    # Function-scope import so this block does not depend on the file header.
    import ast

    for attempt in range(max_retries):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4o",
                temperature=0.4,
                max_tokens=2000,
                messages=[
                    {
                        "role": "user",
                        "content": prompt
                    }
                ]
            )
            logging.info(f"OpenAI successfully prompted.{response.choices[0].message.content} ")
            result = response.choices[0].message.content

            try:
                return json.loads(result)
            except json.JSONDecodeError as decode_error:
                json_match = re.search(r'```(json|python|plaintext)?\s*(.*?)\s*```|\s*({.*?})', result, re.DOTALL)
                json_str = (json_match.group(2) or json_match.group(3)) if json_match else None
                if json_str:
                    try:
                        return json.loads(json_str)
                    except json.JSONDecodeError:
                        pass
                    try:
                        # SECURITY: this used to be eval() on text produced by
                        # the model, i.e. arbitrary code execution on untrusted
                        # input. literal_eval accepts only Python literals.
                        return ast.literal_eval(json_str.replace("'", '"'))
                    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                        pass

                # Nothing could be recovered: surface the original decode error
                # so the outer handler retries the request.
                raise decode_error

        except Exception as e:
            print(f"Error during API request (attempt {attempt + 1} of {max_retries}): {str(e)}")
            if attempt < max_retries - 1:
                time.sleep(delay_seconds)
            else:
                raise Exception(f"Max retries exceeded. Last error: {str(e)}") from e
116
-
117
-
118
def lebanon_passport_extraction(passport_data):
    """Extract structured Lebanese passport fields from OCR text via the LLM.

    Sends an extraction prompt containing ``passport_data`` through
    ``make_api_request_with_retries`` and post-processes a non-empty reply:

    * ``registry_place_and_issue_number`` is renamed ``registry_place_and_number``
      and ``passport_number`` is renamed ``id_number`` (only when non-empty);
    * ``mrz`` is set to ``mrz1 + mrz2`` when both are present;
    * ``issuing_country`` is forced to ``'LBN'`` and an empty/missing
      ``nationality`` defaults to ``'LBN'``;
    * a string ``gender`` is stripped and upper-cased, with ``F``/``M``
      expanded to ``FEMALE``/``MALE``.

    Returns:
        The post-processed reply. An empty/falsy reply is returned unchanged.
        On any exception a dict with empty ``first_name``, ``last_name``,
        ``dob``, ``place_of_birth`` and ``expiry_date`` is returned instead.
    """
    try:
        prompt = f"From the attached text, please extract the data in a structured format, the response should be a dictionary, having first_name(only English), mother_name(mother english name if available, else empty string ''), father_name(only English), name which is last_name(only English), passport_number, dob(dd/mm/yyyy), place_of_birth, nationality(ISO 3166-1 alpha-3 country code), issue_date(dd/mm/yyyy), expiry_date(dd/mm/yyyy), gender(FEMALE, MALE), mrz1, mrz2, registry_place_and_issue_number(if not available then empty string ''). Note that the passport_number should always be 2 letters and 7 digits, if the length is less than 7 then append 0 in the start for passport_number_en and same way for passport_number_ar(numbers in passport arabic as well). Also note that the names should be extracted correctly, don't pick any random words for names, especially for first and last_name, it can be verified from the mrz1 string. So please make sure the names are correctly extracted. The structure of the response should be 'first_name', 'father_name', 'last_name', 'mother_name', 'id_number', 'dob', 'expiry_date', 'issue_date', 'place_of_birth', nationality, registry_place_and_issue_number, etc.. Make sure that the response should only contain a dictionary, and nothing else. Here's the text for your task: {passport_data}"

        back_data = make_api_request_with_retries(prompt)

        if back_data:
            # Rename keys only when they carry a value, as before.
            if back_data.get('registry_place_and_issue_number', ''):
                back_data['registry_place_and_number'] = back_data.pop('registry_place_and_issue_number', '')

            if back_data.get('passport_number', ''):
                back_data['id_number'] = back_data.pop('passport_number', '')

            if back_data.get('mrz1', '') and back_data.get('mrz2', ''):
                back_data['mrz'] = back_data.get('mrz1', '') + back_data.get('mrz2', '')

            back_data['issuing_country'] = 'LBN'

            if not back_data.get('nationality', ''):
                back_data['nationality'] = 'LBN'

            # Normalise gender once. A non-string value (e.g. JSON null) is
            # left untouched: calling .strip() on it used to raise and throw
            # away every other extracted field via the fallback below.
            gender = back_data.get("gender")
            if isinstance(gender, str):
                gender = gender.strip().upper()
                back_data["gender"] = {"F": "FEMALE", "M": "MALE"}.get(gender, gender)

    except Exception as e:
        print(f"Error in processing the extracted data: {e}")
        back_data = {
            'first_name': '',
            'last_name': '',
            'dob': '',
            'place_of_birth': '',
            'expiry_date': ''
        }

    return back_data
@@ -1,248 +0,0 @@
1
- import pandas as pd
2
- import re
3
- from datetime import datetime
4
- from hijri_converter import convert
5
- # from googletrans import Translator
6
- import pycountry
7
- import os
8
-
9
# Import-time side effect: sets GOOGLE_APPLICATION_CREDENTIALS for the whole
# process to a hard-coded JSON file name (no directory component).
# NOTE(review): nothing in the visible code reads this variable - confirm it
# is still required and that the file is resolvable from the working directory.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "streamlit-connection-b1a38b694505 (1).json"
10
-
11
def get_country_code(name_in_foreign_language):
    """Map an Arabic country name to its ISO 3166-1 alpha-3 code.

    Only the hand-maintained names below are recognised (the translation-based
    lookup that used to follow is disabled). Any other input yields ``None``.
    """
    known_codes = {
        'ميانمار': 'MMR',        # Myanmar
        'البحرين': 'BHR',        # Bahrain
        'اليمن الجنوبي': 'YEM',  # Yemen
        'فلسطين': 'PSE',         # Palestine
        'الفلبين': 'PHL',        # Philippines
        'أفغانستان': 'AFG',      # Afghanistan
        'سوريا': 'SYR',          # Syria
        'سري لنكا': 'LKA',       # Sri Lanka
    }
    return known_codes.get(name_in_foreign_language)
35
-
36
# Module-level regex source for a YYYY/M/D date written either with `\d`
# digits or with the digits in the range ۰-۹.
# NOTE(review): the functions visible in this file define their own local
# `pattern`; confirm whether this module-level copy is still used anywhere.
pattern = r'(\d{4}/\d{1,2}/\d{1,2}|[۰-۹]{4}/[۰-۹]{1,2}/[۰-۹]{1,2})'
37
def eastern_arabic_to_english(eastern_numeral):
    """Replace Eastern Arabic digits in ``eastern_numeral`` with ASCII digits.

    Both digit sets are handled: Arabic-Indic (U+0660-U+0669) and Extended
    Arabic-Indic (U+06F0-U+06F9). Every other character, including ``/``,
    is passed through unchanged.
    """
    digit_lookup = {}
    for value in range(10):
        western = str(value)
        digit_lookup[chr(0x0660 + value)] = western  # Arabic-Indic digit
        digit_lookup[chr(0x06F0 + value)] = western  # Extended Arabic-Indic digit

    return ''.join(digit_lookup.get(symbol, symbol) for symbol in eastern_numeral)
56
-
57
def distinguish_dates(date_list):
    """Guess which of the given ``YYYY/MM/DD`` strings is Hijri and which Gregorian.

    The date closest to today is taken to be Gregorian and the one furthest
    from today to be Hijri. With a single element both results are that
    element.

    Args:
        date_list: non-empty list of ``YYYY/MM/DD`` strings (ASCII digits).

    Returns:
        ``(hijri_date, gregorian_date)``.

    Raises:
        IndexError: if ``date_list`` is empty.
        ValueError: if an entry is not three ``/``-separated integers.
    """
    today = datetime.now().date()

    def _days_from_today(date_str):
        try:
            return abs((today - datetime.strptime(date_str, '%Y/%m/%d').date()).days)
        except ValueError:
            # Hijri months may have a day that does not exist in the same
            # Gregorian month (e.g. 1445/02/30), which strptime rejects. The
            # value is only used for ranking, so an approximate day count is
            # enough; non-numeric input still raises ValueError here.
            year, month, day = (int(part) for part in date_str.split('/'))
            approx_ordinal = (year - 1) * 365.2425 + (month - 1) * 30.44 + day
            return abs(today.toordinal() - approx_ordinal)

    # sorted() is stable, so ties keep their input order exactly as the
    # previous list.sort(key=...) did.
    ranked = sorted(date_list, key=_days_from_today)

    gregorian_date = ranked[0]
    hijri_date = ranked[-1]

    return hijri_date, gregorian_date
71
-
72
def hijri_to_gregorian(hijri_date):
    """Convert a ``YYYY/MM/DD`` Hijri date string to a Gregorian ``YYYY/MM/DD`` string.

    Conversion is best-effort: if the input cannot be split into three
    integers or the conversion fails for any reason, ``hijri_date`` is
    returned unchanged.
    """
    try:
        year, month, day = (int(part) for part in hijri_date.split('/'))
        converted = convert.Hijri(year, month, day).to_gregorian()
        return f"{converted.year}/{converted.month:02}/{converted.day:02}"
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit; ordinary errors still fall back to the original value.
        return hijri_date
84
-
85
def extract_dates(input_list):
    """Return the first ``YYYY/MM/DD`` date found in each string of ``input_list``.

    The result has one entry per input string, in order; strings without a
    match contribute ``''``. Month and day must be exactly two digits.
    """
    date_regex = re.compile(r"(\d{4}/\d{2}/\d{2}|[۰۱۲۳۴۵۶۷۸۹]{4}/[۰۱۲۳۴۵۶۷۸۹]{2}/[۰۱۲۳۴۵۶۷۸۹]{2})")

    def first_date(text):
        found = date_regex.search(text)
        return found.group(0) if found else ''

    return [first_date(entry) for entry in input_list]
98
-
99
-
100
def detect_script(word):
    """Classify ``word`` by the scripts of its characters.

    Returns:
        ``"Mixed"`` if it has both Arabic-block (U+0600-U+06FF) and ASCII
        letter characters, ``"Arabic"`` or ``"English"`` if it has only one of
        the two, and ``"Other"`` if it has neither (digits, punctuation,
        empty string, ...).
    """
    def _is_arabic(char):
        # Inclusive of U+06FF, which range(0x0600, 0x06FF) used to miss.
        return '\u0600' <= char <= '\u06FF'

    def _is_english(char):
        # A-Z and a-z only. The previous range(0x41, 0x7A) missed 'z' and
        # wrongly accepted the punctuation between 'Z' and 'a' ([ \ ] ^ _ `).
        return 'A' <= char <= 'Z' or 'a' <= char <= 'z'

    has_arabic = any(_is_arabic(char) for char in word)
    has_english = any(_is_english(char) for char in word)

    if has_arabic and has_english:
        return "Mixed"
    if has_arabic:
        return "Arabic"
    if has_english:
        return "English"
    return "Other"
116
-
117
def extract_english_strings(data):
    """Return the strings in ``data`` that contain no Arabic-block character and no digit.

    Order is preserved. Strings made only of punctuation or whitespace (and
    the empty string) are kept, since they contain neither.
    """
    # Raw string: "\d" inside a normal string literal is an invalid escape
    # sequence that newer Python versions warn about. The re module itself
    # understands \uXXXX, so the pattern matches exactly what it did before.
    arabic_or_digit = re.compile(r"[\u0600-\u06FF\d]")
    return [text for text in data if not arabic_or_digit.search(text)]
123
-
124
def extract_arabic_strings(data):
    """Return the strings in ``data`` that contain an Arabic-block character or a digit.

    Order is preserved. Note that a purely Latin string is also kept when it
    contains a digit.
    """
    return [
        text
        for text in data
        if re.search(r"[\u0600-\u06FF\d]", text) is not None
    ]
130
-
131
def clean_special_chars(data):
    """Strip everything except ASCII letters and whitespace from each string.

    Returns a new list, one entry per input string, with leading and trailing
    whitespace removed after the unwanted characters have been dropped.
    """
    non_letters = re.compile(r'[^A-Za-z\s]')
    return [non_letters.sub('', entry).strip() for entry in data]
137
-
138
def extract_id_details(result):
    """Pull identity fields out of the text elements of an Arabic ID document.

    ``result`` is iterated and each element is searched for Arabic field
    labels - presumably a list of OCR text lines; verify against the caller.
    Every field is extracted best-effort: any failure inside a field's
    ``try`` block leaves that field as ``''``.

    Returns:
        dict with keys ``id_number``, ``nationality``, ``gender`` (always
        ``''``), ``dob`` (reformatted to dd/mm/YYYY when found),
        ``expiry_date`` (always ``''``), ``name`` (English name),
        ``occupation`` and ``employer``. The issuing date/place and the
        Arabic name are computed but not returned.
    """
    # result = detect_text(uploaded_id)
    # Single-row frame: the whole `result` object sits in cell ('res', 0).
    df = pd.DataFrame({'res':[result]})
    pattern = r'(\d{4}/\d{1,2}/\d{1,2}|[۰-۹]{4}/[۰-۹]{1,2}/[۰-۹]{1,2})'
    i = 0
    df['Extracted_data']=''
    try:
        # Nationality: first element containing the label 'الجنسية'
        # ("nationality"); the text around the label is mapped to a code.
        nationality=[ele for ele in [ele for ele in df['res'].iloc[i] if 'الجنسية' in ele ][0].split('الجنسية') if ele!=''][0].strip()

        # get_country_code returns None for names it does not know.
        nationality=get_country_code(nationality)
    except:
        nationality=''
    try:
        ## employer
        # Label 'صاحب العمل' means "employer".
        employer_ar=[ele for ele in [ele for ele in df['res'].iloc[i] if 'صاحب العمل' in ele ]][0]
        employer=[ele for ele in employer_ar.split('صاحب العمل') if ele!=''][0].strip()

    except:
        employer=''
    try:
        ### issuing place
        # Label 'مكان الإصدار' means "place of issue". The value is not
        # part of the returned dict.
        issuing_place_ar=[ele for ele in df['res'].iloc[i] if 'مكان الإصدار' in ele][0]
        issuing_place=issuing_place_ar.split('مكان الإصدار')[-1].strip()
    except:
        issuing_place=''
    try:
        # Attempt 1: the element with 'الإصدار' ("issue") but without 'مكان'
        # ("place") is expected to hold two dates: issue date, then birth date.
        comon_pattern=[ele for ele in [ele for ele in df['res'].iloc[i] if (('الإصدار' in ele ) and('مكان' not in ele))][0].split('الإصدار') if ele!=''][0].strip()
        matches = re.findall(pattern, comon_pattern)

        matches=[eastern_arabic_to_english(ele) for ele in matches]

        issuing_date, dob=matches[0],matches[1]

        #issuing_date = hijri_to_gregorian(issuing_date)

    except:

        try:
            # Attempt 2: birth date ('الميلاد' = "birth") and issue date are
            # on separate elements; take the first text piece of each.
            dob=[ele for ele in [ele for ele in df['res'].iloc[i] if 'الميلاد' in ele ][0].split('الميلاد') if ele!=''][0].strip()
            issuing_date= [ele for ele in [ele for ele in df['res'].iloc[i] if( 'الإصدار' in ele) and ('مكان' not in ele ) ][0].split('الإصدار') if ele!=''][0].strip()
            #issuing_date=hijri_to_gregorian(issuing_date)
        except:
            try:
                # Attempt 3: both values come from the 'الميلاد' element.
                dob=[ele for ele in [ele for ele in df['res'].iloc[i] if 'الميلاد' in ele ][0].split('الميلاد') if ele!=''][-1].strip()
                # NOTE(review): str.strip('الانتهاء') removes any of those
                # *characters* from both ends, not the word 'الانتهاء'
                # ("expiry") as a whole - confirm this is intended.
                issuing_date=[ele for ele in [ele for ele in df['res'].iloc[i] if 'الميلاد' in ele ][0].split('الميلاد') if ele!=''][0].strip('الانتهاء').strip()
                #issuing_date=hijri_to_gregorian(issuing_date)
            except:
                issuing_date,dob='',''

    try:

        #issuing_date_ar,dob_ar=re.findall(pattern, comon_pattern)

        ### Id Number
        # First element that is exactly ten digits; \d also matches Eastern
        # Arabic digits, hence the conversion afterwards.
        id_number=[item for item in df['res'].iloc[i] if re.fullmatch(r'\d{10}', item)][0]
        id_number=eastern_arabic_to_english(id_number)

    except:

        try:
            # Fallback: take the 10-character token next to the label
            # 'الرقم' ("number").
            id_number = [ele for ele in eastern_arabic_to_english([ele for ele in [ele for ele in df['res'].iloc[i] if 'الرقم' in ele ][0].split('الرقم') if ele!=''][0].strip()).split(' ') if len(ele)==10][0]
            id_number=eastern_arabic_to_english(id_number)
        except:
            id_number=''

    try:
        # Label 'المهنة' means "profession"; the last text piece is used.
        profession_Ar=[ele for ele in [ele for ele in df['res'].iloc[i] if 'المهنة' in ele ]][0]

        profession=[ele for ele in profession_Ar.split('المهنة') if ele!=''][-1]

    except:
        profession=''
    try:
        # Arabic name: the Arabic/digit string right after the first one
        # containing 'وزارة' ("ministry").
        Name_Index=[extract_arabic_strings(df['res'].iloc[i]).index(ele) for ele in extract_arabic_strings(df['res'].iloc[i]) if 'وزارة' in ele][0]
        Name_1=extract_arabic_strings(df['res'].iloc[i])[Name_Index+1]
        Name_length=len(Name_1.split(' '))
        # English name: the cleaned Latin string with the most spaces,
        # ignoring the two fixed header lines.
        Name_en=max([ele for ele in clean_special_chars(extract_english_strings(df['res'].iloc[i])) if ele not in ['KINGDOM OF SAUDI ARABIA','MINISTRY OF INTERIOR']], key=lambda x: x.count(' '))
        Name_ar=[ele for ele in [Name_1,Name_en] if ele!=Name_en][0]

    except:

        Name_en,Name_ar='',''

    # NOTE(review): chained assignment (df[col].iloc[i] = ...) relies on
    # pandas writing through to `df`; confirm with the pandas version in use.
    df['Extracted_data'].iloc[i]=[nationality,employer,issuing_date, dob,id_number,profession,Name_en,Name_ar]

    cols = ['nationality', 'employer', 'issuing_date', 'dob', 'id_number', 'profession', 'Name_en', 'Name_ar']

    # Spread the packed list into one column per field (apply runs inside
    # each iteration, so `index` has the current value when the lambda runs).
    for index, col_name in enumerate(cols):
        df[col_name] = df['Extracted_data'].apply(lambda x: x[index])

    # extract_dates only accepts two-digit month/day, so a value such as
    # '1990/1/2' becomes '' here.
    df['dob']=extract_dates(df['dob'].tolist())

    df['dob']=df['dob'].apply(lambda x: eastern_arabic_to_english(x))

    df['issuing_date']=df['issuing_date'].apply(lambda x: eastern_arabic_to_english(x))

    df['issuing_date']=df['issuing_date'].apply(lambda x: hijri_to_gregorian(x))

    dob = df['dob'].iloc[0]

    if dob:
        # Re-order YYYY/MM/DD into dd/mm/YYYY; this strptime is not guarded
        # by a try block, so an invalid calendar date raises ValueError.
        parsed_date = datetime.strptime(dob, "%Y/%m/%d")
        dob = parsed_date.strftime("%d/%m/%Y")

    # df['gender']= ''
    # df['expiry_data']= ' '

    # print(df)
    # TODO: gender, expiry_data
    return {'id_number': df['id_number'].iloc[0], 'nationality': df['nationality'].iloc[0], 'gender': '', 'dob': dob, 'expiry_date': '', 'name': df['Name_en'].iloc[0], 'occupation': df['profession'].iloc[0], 'employer': df['employer'].iloc[0]}
248
-