archive-ai 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
archive_ai/ai.py CHANGED
@@ -1,213 +1,251 @@
1
- # Fundemental imports
2
- #####################################################################
3
- from django.utils import timezone
4
- from django.conf import settings
5
- from django.apps import apps
6
- from django.core.exceptions import FieldDoesNotExist
7
- import difflib
8
- import os
9
- import re
10
- import json
11
- import requests
12
- import logging
13
-
14
- logger = logging.getLogger(__name__)
15
-
16
- def extract_metadata_with_llm(model_name, ocr_text):
17
- url = settings.LLM_API_URL
18
-
19
- system_prompt = """
20
- You are a strict JSON generator.
21
-
22
- Analyze Arabic OCR text and extract structured metadata for one document.
23
-
24
- Respond with EXACTLY one JSON object.
25
- No explanations, no comments, no markdown, no text outside the JSON.
26
-
27
- Current document type: {model_name}
28
-
29
- Output format:
30
- {
31
- "title": "",
32
- "body": "",
33
-
34
- "document_number": "",
35
- "og_document_number": "",
36
-
37
- "document_date": "",
38
- "og_document_date": "",
39
-
40
- "source": "",
41
- "destination": ""
42
- }
43
-
44
- Rules:
45
-
46
- 1. Document types:
47
- - decree: formal decree, numbered articles, legal references, decree number, subject, footer date.
48
- - incoming: original letter + received stamp (usually 2 dates, 2 numbers), source and destination, letter body.
49
- - outgoing / internal / other: usually one date and one number. Source is a ministry department, destination is another entity.
50
-
51
- 2. Dates:
52
- - One date → document_date
53
- - Two dates → older (letter header) = og_document_date, newer (stamp) = document_date
54
-
55
- 3. Numbers:
56
- - document_number:
57
- - outgoing/internal/other → official document number
58
- - incoming → received stamp number
59
- - og_document_number:
60
- - incoming only → original document number
61
- - Number fields MUST always contain digits only.
62
- - DO NOT use Eastern Arabic numerals (like ٠١٢٣) or any other numeral system.
63
- - DO NOT include letters. If OCR included letters with the numbers, REMOVE the letters and keep the numbers.
64
-
65
- 4. Source & Destination:
66
- - Extract real entity names only. Clean OCR noise but do NOT invent names.
67
- - Incoming source: organization in the header.
68
- - Outgoing source: department under the signer’s name.
69
- - Destination often appears near the start, commonly after the words sir or sirs and a "/".
70
-
71
- 5. Title:
72
- - Extract explicit subject/title if present (will always be in the beggining if present).
73
- - Otherwise infer a short, accurate phrase from the document text.
74
-
75
- 6. Body:
76
- - Return the full cleaned document text with OCR noise removed.
77
-
78
- Always fill all fields.
79
- If unknown or not applicable, use an empty string "".
80
-
81
- Return ONLY the JSON object.
82
- """
83
-
84
- user_prompt = f"""
85
- OCR TEXT:
86
- \"\"\"
87
- {ocr_text}
88
- \"\"\"
89
- """
90
-
91
- payload = {
92
- "model": "/models/" + settings.LLM_MODEL,
93
- "messages": [
94
- {"role": "system", "content": system_prompt},
95
- {"role": "user", "content": user_prompt}
96
- ],
97
- "temperature": 0.5,
98
- "max_tokens": 1024,
99
- "response_format": {"type": "json_object"},
100
- "top_p": 0.8,
101
- "min_p": 0,
102
- "top_k": 20,
103
- "presence_penalty": 0.5,
104
- }
105
-
106
- # ------------------------------------------------
107
- # Call the LLM API
108
- # ------------------------------------------------
109
- try:
110
- response = requests.post(url, json=payload, timeout=(10, 1200))
111
- response.raise_for_status()
112
- except requests.exceptions.RequestException:
113
- raise
114
-
115
- data = response.json()
116
-
117
- if "choices" not in data or not data["choices"]:
118
- raise ValueError("LLM API returned no choices.")
119
-
120
- raw_content = data["choices"][0]["message"]["content"]
121
-
122
- # ------------------------------------------------
123
- # Extract JSON safely
124
- # ------------------------------------------------
125
- metadata = extract_json_from_text(raw_content)
126
-
127
- # Ensure required keys exist
128
- required = ["source", "destination", "document_number", "document_date", "title", "body"]
129
- for key in required:
130
- metadata.setdefault(key, "")
131
-
132
- return metadata
133
-
134
- def extract_json_from_text(text):
135
- """
136
- Safely extracts the first JSON object from LLM output.
137
- Handles cases like:
138
- - ```json {...} ```
139
- - text before/after JSON
140
- - invalid trailing content
141
- """
142
- match = re.search(r"\{.*\}", text, flags=re.DOTALL)
143
- if not match:
144
- raise ValueError("No JSON object found in LLM response.")
145
-
146
- json_str = match.group()
147
-
148
- try:
149
- return json.loads(json_str)
150
- except json.JSONDecodeError as exc:
151
- raise ValueError(f"Malformed JSON: {exc}")
152
-
153
- def validate_date(date_str):
154
- """Return a cleaned YYYY-MM-DD or None."""
155
- if not date_str:
156
- return None
157
- try:
158
- timezone.datetime.strptime(date_str, "%Y-%m-%d")
159
- return date_str
160
- except Exception:
161
- return None
162
-
163
- def _find_best_match(input_text, model_name, field_name, cutoff=60):
164
-
165
- if not input_text:
166
- return None
167
-
168
- text = str(input_text).strip()
169
-
170
- app_label = getattr(settings, "OCR_APP_LABEL", "documents")
171
-
172
- try:
173
- model = apps.get_model(app_label, model_name)
174
- except LookupError:
175
- logger.error(f"Model {model_name} not found")
176
- return None
177
-
178
- try:
179
- field = model._meta.get_field(field_name)
180
- except FieldDoesNotExist:
181
- logger.error(f"Field {field_name} not found in model {model_name}")
182
- return None
183
-
184
- related_model = field.related_model
185
- possible_fields = ['source', 'destination']
186
-
187
- for field in possible_fields:
188
- if hasattr(related_model, field):
189
- # Get all values for this field
190
- choices = list(related_model.objects.values_list('id', field))
191
-
192
- if choices:
193
- # Extract just the text values for matching
194
- values = [value for _, value in choices]
195
-
196
- # Use difflib to find closest match
197
- matches = difflib.get_close_matches(text, values, n=1, cutoff=cutoff)
198
-
199
- if matches:
200
- best_match = matches[0]
201
- # Find and return the corresponding ID
202
- for id, value in choices:
203
- if value == best_match:
204
- return id
205
-
206
- return None
207
-
208
- def find_closest_source(raw_text, model_name):
209
- return _find_best_match(raw_text, model_name, "source")
210
-
211
-
212
- def find_closest_destination(raw_text, model_name):
213
- return _find_best_match(raw_text, model_name, "destination")
1
+ # Fundemental imports
2
+ #####################################################################
3
+ from django.utils import timezone
4
+ from django.conf import settings
5
+ from django.apps import apps
6
+ from django.core.exceptions import FieldDoesNotExist
7
+ import difflib
8
+ import os
9
+ import re
10
+ import json
11
+ import requests
12
+ import logging
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ def extract_metadata_with_llm(model_name, ocr_text):
17
+ url = settings.LLM_API_URL
18
+
19
+ system_prompt = """
20
+ You are a strict JSON generator.
21
+
22
+ Analyze Arabic OCR text and extract structured metadata for one document.
23
+
24
+ Respond with EXACTLY one JSON object.
25
+ No explanations, no comments, no markdown, no text outside the JSON.
26
+
27
+ Current document type: {model_name}
28
+
29
+ Output format:
30
+ {
31
+ "title": "",
32
+ "body": "",
33
+
34
+ "document_number": "",
35
+ "og_document_number": "",
36
+
37
+ "document_date": "",
38
+ "og_document_date": "",
39
+
40
+ "source": "",
41
+ "destination": ""
42
+ }
43
+
44
+ Rules:
45
+
46
+ 1. Document types:
47
+ - decree: formal decree, numbered articles, legal references, decree number, subject, footer date.
48
+ - incoming: original letter + received stamp (usually 2 dates, 2 numbers), source and destination, letter body.
49
+ - outgoing / internal / other: usually one date and one number. Source is a ministry department, destination is another entity.
50
+
51
+ 2. Dates:
52
+ - One date → document_date
53
+ - Two dates → older (letter header) = og_document_date, newer (stamp) = document_date
54
+
55
+ 3. Numbers:
56
+ - document_number:
57
+ - outgoing/internal/other → official document number
58
+ - incoming → received stamp number
59
+ - og_document_number:
60
+ - incoming only → original document number
61
+ - Number fields MUST always contain digits only.
62
+ - DO NOT use Eastern Arabic numerals (like ٠١٢٣) or any other numeral system.
63
+ - DO NOT include letters. If OCR included letters with the numbers, REMOVE the letters and keep the numbers.
64
+
65
+ 4. Source & Destination:
66
+ - Extract real entity names only. Clean OCR noise but do NOT invent names.
67
+ - Incoming source: organization in the header.
68
+ - Outgoing source: department under the signer’s name.
69
+ - Destination often appears near the start, commonly after the words sir or sirs and a "/".
70
+
71
+ 5. Title:
72
+ - Extract explicit subject/title if present (will always be in the beggining if present).
73
+ - Otherwise infer a short, accurate phrase from the document text.
74
+
75
+ 6. Body:
76
+ - Return the full cleaned document text with OCR noise removed.
77
+
78
+ Always fill all fields.
79
+ If unknown or not applicable, use an empty string "".
80
+
81
+ Return ONLY the JSON object.
82
+ """
83
+
84
+ user_prompt = f"""
85
+ OCR TEXT:
86
+ \"\"\"
87
+ {ocr_text}
88
+ \"\"\"
89
+ """
90
+
91
+ payload = {
92
+ "model": "/models/" + settings.LLM_MODEL,
93
+ "messages": [
94
+ {"role": "system", "content": system_prompt},
95
+ {"role": "user", "content": user_prompt}
96
+ ],
97
+ "temperature": 0.5,
98
+ "max_tokens": 1024,
99
+ "response_format": {"type": "json_object"},
100
+ "top_p": 0.8,
101
+ "min_p": 0,
102
+ "top_k": 20,
103
+ "presence_penalty": 0.5,
104
+ }
105
+
106
+ # ------------------------------------------------
107
+ # Call the LLM API
108
+ # ------------------------------------------------
109
+ try:
110
+ response = requests.post(url, json=payload, timeout=(10, 1200))
111
+ response.raise_for_status()
112
+ except requests.exceptions.RequestException:
113
+ raise
114
+
115
+ data = response.json()
116
+
117
+ if "choices" not in data or not data["choices"]:
118
+ raise ValueError("LLM API returned no choices.")
119
+
120
+ raw_content = data["choices"][0]["message"]["content"]
121
+
122
+ # ------------------------------------------------
123
+ # Extract JSON safely
124
+ # ------------------------------------------------
125
+ metadata = extract_json_from_text(raw_content)
126
+
127
+
128
+
129
+ # Ensure required keys exist
130
+ required = ["source", "destination", "document_number", "document_date", "title", "body"]
131
+ for key in required:
132
+ metadata.setdefault(key, "")
133
+
134
+ # Sanitize number fields (strict integer enforcement)
135
+ # Even if the AI hallucinates "Ref: 123", we want "123".
136
+ # If "No number", we want "".
137
+ metadata["document_number"] = clean_number_field(metadata["document_number"])
138
+ if "og_document_number" in metadata:
139
+ metadata["og_document_number"] = clean_number_field(metadata["og_document_number"])
140
+
141
+ return metadata
142
+
143
+ def extract_json_from_text(text):
144
+ """
145
+ Safely extracts the first JSON object from LLM output.
146
+ Handles cases like:
147
+ - ```json {...} ```
148
+ - text before/after JSON
149
+ - invalid trailing content
150
+ """
151
+ match = re.search(r"\{.*\}", text, flags=re.DOTALL)
152
+ if not match:
153
+ raise ValueError("No JSON object found in LLM response.")
154
+
155
+ json_str = match.group()
156
+
157
+ try:
158
+ return json.loads(json_str)
159
+ except json.JSONDecodeError as exc:
160
+ raise ValueError(f"Malformed JSON: {exc}")
161
+
162
+ def validate_date(date_str):
163
+ """Return a cleaned YYYY-MM-DD or None."""
164
+ if not date_str:
165
+ return None
166
+ try:
167
+ timezone.datetime.strptime(date_str, "%Y-%m-%d")
168
+ return date_str
169
+ except Exception:
170
+ return None
171
+
172
+ def _find_best_match(input_text, model_name, field_name, cutoff=60):
173
+
174
+ if not input_text:
175
+ return None
176
+
177
+ text = str(input_text).strip()
178
+
179
+ app_label = getattr(settings, "OCR_APP_LABEL", "documents")
180
+
181
+ try:
182
+ model = apps.get_model(app_label, model_name)
183
+ except LookupError:
184
+ logger.error(f"Model {model_name} not found")
185
+ return None
186
+
187
+ try:
188
+ field = model._meta.get_field(field_name)
189
+ except FieldDoesNotExist:
190
+ logger.error(f"Field {field_name} not found in model {model_name}")
191
+ return None
192
+
193
+ related_model = field.related_model
194
+ possible_fields = ['source', 'destination']
195
+
196
+ for field in possible_fields:
197
+ if hasattr(related_model, field):
198
+ # Get all values for this field
199
+ choices = list(related_model.objects.values_list('id', field))
200
+
201
+ if choices:
202
+ # Extract just the text values for matching
203
+ values = [value for _, value in choices]
204
+
205
+ # Use difflib to find closest match
206
+ matches = difflib.get_close_matches(text, values, n=1, cutoff=cutoff)
207
+
208
+ if matches:
209
+ best_match = matches[0]
210
+ # Find and return the corresponding ID
211
+ for id, value in choices:
212
+ if value == best_match:
213
+ return id
214
+
215
+ return None
216
+
217
+ def find_closest_source(raw_text, model_name):
218
+ return _find_best_match(raw_text, model_name, "source")
219
+
220
+
221
+ def find_closest_destination(raw_text, model_name):
222
+ return _find_best_match(raw_text, model_name, "destination")
223
+
224
+ def clean_number_field(value):
225
+ """
226
+ Removes all non-digit characters from the string.
227
+ Also handles common OCR substitutions (letters that look like numbers).
228
+ Returns a string of digits, or empty string if no digits found.
229
+ """
230
+ if not value:
231
+ return ""
232
+
233
+ val_str = str(value)
234
+
235
+ # Common OCR substitutions map
236
+ substitutions = {
237
+ 's': '5', 'S': '5',
238
+ 'z': '7', 'Z': '7',
239
+ 'b': '8', 'B': '8',
240
+ 'l': '1', 'I': '1', 'i': '1',
241
+ 'o': '0', 'O': '0',
242
+ }
243
+
244
+ # Apply substitutions
245
+ for char, digit in substitutions.items():
246
+ val_str = val_str.replace(char, digit)
247
+
248
+ # Filter only digits
249
+ cleaned = "".join(filter(str.isdigit, val_str))
250
+
251
+ return cleaned
archive_ai/apps.py CHANGED
@@ -1,8 +1,8 @@
1
- from django.apps import AppConfig
2
-
3
- class SmartArchiveConfig(AppConfig):
4
- default_auto_field = 'django.db.models.BigAutoField'
5
- name = 'smart_archive'
6
-
7
- def ready(self):
8
- import smart_archive.signals
1
+ from django.apps import AppConfig
2
+
3
+ class ArchiveAiConfig(AppConfig):
4
+ default_auto_field = 'django.db.models.BigAutoField'
5
+ name = 'archive_ai'
6
+
7
+ def ready(self):
8
+ import archive_ai.signals
archive_ai/signals.py CHANGED
@@ -1,29 +1,29 @@
1
- from django.db.models.signals import post_save
2
- from django.dispatch import receiver
3
- from .tasks import send_pdf_to_ocr
4
-
5
- # Defines which models should trigger the OCR.
6
- # Ideally this should be configurable, but for now we follow the original list.
7
- MODELS = [
8
- "Decree",
9
- "Incoming",
10
- "Outgoing",
11
- "Internal",
12
- "Report",
13
- "Other",
14
- ]
15
-
16
- @receiver(post_save)
17
- def trigger_ocr(sender, instance, created, **kwargs):
18
- if not created:
19
- return
20
- if sender.__name__ not in MODELS:
21
- return
22
-
23
- # All your models have a 'pdf' field but if the name differs,
24
- # adjust here.
25
- send_pdf_to_ocr.delay(
26
- model_name=sender.__name__,
27
- obj_id=instance.id,
28
- pdf_field_name="pdf_file" # change if your field name differs
29
- )
1
+ from django.db.models.signals import post_save
2
+ from django.dispatch import receiver
3
+ from .tasks import send_pdf_to_ocr
4
+
5
+ # Defines which models should trigger the OCR.
6
+ # Ideally this should be configurable, but for now we follow the original list.
7
+ MODELS = [
8
+ "Decree",
9
+ "Incoming",
10
+ "Outgoing",
11
+ "Internal",
12
+ "Report",
13
+ "Other",
14
+ ]
15
+
16
+ @receiver(post_save)
17
+ def trigger_ocr(sender, instance, created, **kwargs):
18
+ if not created:
19
+ return
20
+ if sender.__name__ not in MODELS:
21
+ return
22
+
23
+ # All your models have a 'pdf' field but if the name differs,
24
+ # adjust here.
25
+ send_pdf_to_ocr.delay(
26
+ model_name=sender.__name__,
27
+ obj_id=instance.id,
28
+ pdf_field_name="pdf_file" # change if your field name differs
29
+ )
archive_ai/tasks.py CHANGED
@@ -1,111 +1,111 @@
1
- from celery import shared_task
2
- from django.apps import apps
3
- from django.conf import settings
4
- import requests
5
- import os
6
- import logging
7
- from .ai import extract_metadata_with_llm, validate_date, find_closest_source, find_closest_destination
8
-
9
- logger = logging.getLogger(__name__)
10
-
11
- @shared_task(bind=True, max_retries=3, default_retry_delay=120, soft_time_limit=800, time_limit=1200)
12
- def send_pdf_to_ocr(self, model_name, obj_id, pdf_field_name):
13
- """
14
- Uploads a PDF to the OCR API, processes text through LLM metadata extractor,
15
- and updates the related Django model.
16
-
17
- Retries ONLY for network-related issues.
18
- """
19
-
20
- app_label = getattr(settings, "OCR_APP_LABEL", "documents")
21
- Model = apps.get_model(app_label, model_name)
22
-
23
- try:
24
- obj = Model.objects.get(id=obj_id)
25
- except Model.DoesNotExist:
26
- logger.error(f"[OCR TASK] Object {obj_id} of type {model_name} does not exist.")
27
- return False
28
-
29
- pdf_field = getattr(obj, pdf_field_name)
30
- pdf_path = pdf_field.path
31
-
32
- # ------------------------------------------------
33
- # Ensure PDF exists
34
- # ------------------------------------------------
35
- if not os.path.exists(pdf_path):
36
- logger.error(f"[OCR TASK] PDF file missing: {pdf_path}")
37
- return False
38
-
39
- # ------------------------------------------------
40
- # 1. Send to OCR API
41
- # ------------------------------------------------
42
- try:
43
- with open(pdf_path, "rb") as f:
44
- response = requests.post(
45
- settings.OCR_API_URL,
46
- files={"file": f},
47
- data={"rotation": 0, "mode": "Markdown"},
48
- timeout=1200,
49
- )
50
- response.raise_for_status()
51
- except requests.exceptions.RequestException as exc:
52
- # Retry only real network failures
53
- logger.warning(f"[OCR TASK] Network error, retrying: {exc}")
54
- raise self.retry(exc=exc)
55
-
56
- try:
57
- ocr_text = response.json().get("text", "")
58
- except Exception as exc:
59
- logger.error(f"[OCR TASK] Invalid OCR response JSON: {exc}")
60
- return False
61
-
62
- # ------------------------------------------------
63
- # 2. Extract metadata using LLM
64
- # ------------------------------------------------
65
- try:
66
- metadata = extract_metadata_with_llm(model_name, ocr_text)
67
- except requests.exceptions.RequestException as exc:
68
- logger.warning(f"[OCR TASK] LLM network failure, retrying: {exc}")
69
- raise self.retry(exc=exc)
70
- except Exception as exc:
71
- logger.error(f"[OCR TASK] Metadata parsing failed: {exc}")
72
- return False
73
-
74
- # ------------------------------------------------
75
- # 3. Apply metadata safely
76
- # ------------------------------------------------
77
- try:
78
- # Universal fields
79
- obj.title = metadata.get("title") or obj.title
80
- obj.text = metadata.get("body", obj.text)
81
-
82
- # Set number depending on doc type
83
- if metadata.get("doc_type") == "incoming":
84
- obj.number = metadata.get("og_document_number") or obj.number
85
- obj.received_number = metadata.get("document_number") or obj.received_number
86
- else:
87
- obj.number = metadata.get("document_number") or obj.number
88
-
89
- # Dates
90
- doc_date = validate_date(metadata.get("document_date", ""))
91
- og_date = validate_date(metadata.get("og_document_date", ""))
92
-
93
- if metadata.get("doc_type") == "incoming":
94
- if og_date:
95
- obj.og_date = og_date
96
- else:
97
- if doc_date:
98
- obj.date = doc_date
99
-
100
- # Source / Destination logic if needed
101
- # (This part was commented out in original, keeping it that way or enabling based on user needs.
102
- # I'll keep it commented out to match original behavior, but ensure imports are available if they uncomment)
103
-
104
- obj.save()
105
-
106
- except Exception as exc:
107
- logger.error(f"[OCR TASK] Failed to update object {obj_id}: {exc}")
108
- return False
109
-
110
- logger.info(f"[OCR TASK] Completed successfully for obj {obj_id}")
111
- return True
1
+ from celery import shared_task
2
+ from django.apps import apps
3
+ from django.conf import settings
4
+ import requests
5
+ import os
6
+ import logging
7
+ from .ai import extract_metadata_with_llm, validate_date, find_closest_source, find_closest_destination
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ @shared_task(bind=True, max_retries=3, default_retry_delay=120, soft_time_limit=800, time_limit=1200)
12
+ def send_pdf_to_ocr(self, model_name, obj_id, pdf_field_name):
13
+ """
14
+ Uploads a PDF to the OCR API, processes text through LLM metadata extractor,
15
+ and updates the related Django model.
16
+
17
+ Retries ONLY for network-related issues.
18
+ """
19
+
20
+ app_label = getattr(settings, "OCR_APP_LABEL", "documents")
21
+ Model = apps.get_model(app_label, model_name)
22
+
23
+ try:
24
+ obj = Model.objects.get(id=obj_id)
25
+ except Model.DoesNotExist:
26
+ logger.error(f"[OCR TASK] Object {obj_id} of type {model_name} does not exist.")
27
+ return False
28
+
29
+ pdf_field = getattr(obj, pdf_field_name)
30
+ pdf_path = pdf_field.path
31
+
32
+ # ------------------------------------------------
33
+ # Ensure PDF exists
34
+ # ------------------------------------------------
35
+ if not os.path.exists(pdf_path):
36
+ logger.error(f"[OCR TASK] PDF file missing: {pdf_path}")
37
+ return False
38
+
39
+ # ------------------------------------------------
40
+ # 1. Send to OCR API
41
+ # ------------------------------------------------
42
+ try:
43
+ with open(pdf_path, "rb") as f:
44
+ response = requests.post(
45
+ settings.OCR_API_URL,
46
+ files={"file": f},
47
+ data={"rotation": 0, "mode": "Markdown"},
48
+ timeout=1200,
49
+ )
50
+ response.raise_for_status()
51
+ except requests.exceptions.RequestException as exc:
52
+ # Retry only real network failures
53
+ logger.warning(f"[OCR TASK] Network error, retrying: {exc}")
54
+ raise self.retry(exc=exc)
55
+
56
+ try:
57
+ ocr_text = response.json().get("text", "")
58
+ except Exception as exc:
59
+ logger.error(f"[OCR TASK] Invalid OCR response JSON: {exc}")
60
+ return False
61
+
62
+ # ------------------------------------------------
63
+ # 2. Extract metadata using LLM
64
+ # ------------------------------------------------
65
+ try:
66
+ metadata = extract_metadata_with_llm(model_name, ocr_text)
67
+ except requests.exceptions.RequestException as exc:
68
+ logger.warning(f"[OCR TASK] LLM network failure, retrying: {exc}")
69
+ raise self.retry(exc=exc)
70
+ except Exception as exc:
71
+ logger.error(f"[OCR TASK] Metadata parsing failed: {exc}")
72
+ return False
73
+
74
+ # ------------------------------------------------
75
+ # 3. Apply metadata safely
76
+ # ------------------------------------------------
77
+ try:
78
+ # Universal fields
79
+ obj.title = metadata.get("title") or obj.title
80
+ obj.text = metadata.get("body", obj.text)
81
+
82
+ # Set number depending on doc type
83
+ if metadata.get("doc_type") == "incoming":
84
+ obj.number = metadata.get("og_document_number") or obj.number
85
+ obj.received_number = metadata.get("document_number") or obj.received_number
86
+ else:
87
+ obj.number = metadata.get("document_number") or obj.number
88
+
89
+ # Dates
90
+ doc_date = validate_date(metadata.get("document_date", ""))
91
+ og_date = validate_date(metadata.get("og_document_date", ""))
92
+
93
+ if metadata.get("doc_type") == "incoming":
94
+ if og_date:
95
+ obj.og_date = og_date
96
+ else:
97
+ if doc_date:
98
+ obj.date = doc_date
99
+
100
+ # Source / Destination logic if needed
101
+ # (This part was commented out in original, keeping it that way or enabling based on user needs.
102
+ # I'll keep it commented out to match original behavior, but ensure imports are available if they uncomment)
103
+
104
+ obj.save()
105
+
106
+ except Exception as exc:
107
+ logger.error(f"[OCR TASK] Failed to update object {obj_id}: {exc}")
108
+ return False
109
+
110
+ logger.info(f"[OCR TASK] Completed successfully for obj {obj_id}")
111
+ return True
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: archive-ai
3
- Version: 1.0.0
3
+ Version: 1.0.2
4
4
  Summary: A Django app for AI and OCR logic.
5
5
  Home-page: https://github.com/debeski1/archive-ai-app
6
6
  Author: Debeski
@@ -0,0 +1,10 @@
1
+ archive_ai/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ archive_ai/ai.py,sha256=P-sY_VhUUT2NrcJMNDSuDBT_fl8BKtKrYR4Hm-ihv7o,7490
3
+ archive_ai/apps.py,sha256=uW444MG4pcCEUZXrPbXnoMx28u9Ng5fpZ9HkNdWbhZk,206
4
+ archive_ai/signals.py,sha256=KqzYkjvw3vS6WqtPgVOEJE_QSxf85sMhAZ25v_YjLCQ,768
5
+ archive_ai/tasks.py,sha256=dCwC6gW_HFaftBw23a7xVlNW9Lioje8suNwzLzdQq4c,4079
6
+ archive_ai-1.0.2.dist-info/licenses/LICENSE,sha256=ESYyLizI0WWtxMeS7rGVcX3ivMezm-HOd5WdeOh-9oU,1056
7
+ archive_ai-1.0.2.dist-info/METADATA,sha256=YqbWTJeWrelAsxni199xeZY7A1RLMydEX7kDPdgavtE,1368
8
+ archive_ai-1.0.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
9
+ archive_ai-1.0.2.dist-info/top_level.txt,sha256=FahgirRqHLDonpCp0VlSBpF7e72exaNU5gYQh1P7sRQ,11
10
+ archive_ai-1.0.2.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.10.1)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,21 +1,21 @@
1
- MIT License
2
-
3
- Copyright (c) 2026
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in all
13
- copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
1
+ MIT License
2
+
3
+ Copyright (c) 2026
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -1,10 +0,0 @@
1
- archive_ai/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- archive_ai/ai.py,sha256=rKcMHnWUm9KEZ9zbPd4QTd2eryX0IdBJSzJGbb9G1BM,6571
3
- archive_ai/apps.py,sha256=_mZyBpnl3lMujJRnnS6PX3G_p_BbEnobgqJMTD5eXHg,223
4
- archive_ai/signals.py,sha256=ERLNttP7h-mTDDsB6T8NEDH8R1qYlZpO8-c6j5NRCKw,797
5
- archive_ai/tasks.py,sha256=TMkr6Jna7e__K34jNWmhlGGZ2MUnoOzzRjvx-_trWt0,4190
6
- archive_ai-1.0.0.dist-info/licenses/LICENSE,sha256=XKKSDU9WlUEAyPNlRhq6e2xhVNpJc097JwPZJ1rUnRE,1077
7
- archive_ai-1.0.0.dist-info/METADATA,sha256=FbFcNmPsxoA0z532PJq1FKKExicy5VPGN8XS1hCusRk,1368
8
- archive_ai-1.0.0.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
9
- archive_ai-1.0.0.dist-info/top_level.txt,sha256=FahgirRqHLDonpCp0VlSBpF7e72exaNU5gYQh1P7sRQ,11
10
- archive_ai-1.0.0.dist-info/RECORD,,