archive-ai 1.0.0__tar.gz → 1.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {archive_ai-1.0.0 → archive_ai-1.0.2}/LICENSE +21 -21
- {archive_ai-1.0.0 → archive_ai-1.0.2}/MANIFEST.in +4 -4
- {archive_ai-1.0.0/archive_ai.egg-info → archive_ai-1.0.2}/PKG-INFO +1 -1
- {archive_ai-1.0.0 → archive_ai-1.0.2}/README.md +25 -25
- {archive_ai-1.0.0 → archive_ai-1.0.2}/archive_ai/ai.py +251 -213
- {archive_ai-1.0.0 → archive_ai-1.0.2}/archive_ai/apps.py +8 -8
- {archive_ai-1.0.0 → archive_ai-1.0.2}/archive_ai/signals.py +29 -29
- {archive_ai-1.0.0 → archive_ai-1.0.2}/archive_ai/tasks.py +111 -111
- {archive_ai-1.0.0 → archive_ai-1.0.2/archive_ai.egg-info}/PKG-INFO +1 -1
- {archive_ai-1.0.0 → archive_ai-1.0.2}/setup.py +29 -29
- {archive_ai-1.0.0 → archive_ai-1.0.2}/archive_ai/__init__.py +0 -0
- {archive_ai-1.0.0 → archive_ai-1.0.2}/archive_ai.egg-info/SOURCES.txt +0 -0
- {archive_ai-1.0.0 → archive_ai-1.0.2}/archive_ai.egg-info/dependency_links.txt +0 -0
- {archive_ai-1.0.0 → archive_ai-1.0.2}/archive_ai.egg-info/requires.txt +0 -0
- {archive_ai-1.0.0 → archive_ai-1.0.2}/archive_ai.egg-info/top_level.txt +0 -0
- {archive_ai-1.0.0 → archive_ai-1.0.2}/setup.cfg +0 -0
|
@@ -1,21 +1,21 @@
|
|
|
1
|
-
MIT License
|
|
2
|
-
|
|
3
|
-
Copyright (c) 2026
|
|
4
|
-
|
|
5
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
-
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
-
in the Software without restriction, including without limitation the rights
|
|
8
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
-
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
-
furnished to do so, subject to the following conditions:
|
|
11
|
-
|
|
12
|
-
The above copyright notice and this permission notice shall be included in all
|
|
13
|
-
copies or substantial portions of the Software.
|
|
14
|
-
|
|
15
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
-
SOFTWARE.
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
include LICENSE
|
|
2
|
-
include README.md
|
|
3
|
-
recursive-include archive_ai/templates *
|
|
4
|
-
recursive-include archive_ai/static *
|
|
1
|
+
include LICENSE
|
|
2
|
+
include README.md
|
|
3
|
+
recursive-include archive_ai/templates *
|
|
4
|
+
recursive-include archive_ai/static *
|
|
@@ -1,25 +1,25 @@
|
|
|
1
|
-
# Archive AI
|
|
2
|
-
|
|
3
|
-
A Django app that provides AI and OCR capabilities.
|
|
4
|
-
|
|
5
|
-
## Installation
|
|
6
|
-
|
|
7
|
-
1. Install via pip:
|
|
8
|
-
```bash
|
|
9
|
-
pip install archive-ai
|
|
10
|
-
```
|
|
11
|
-
|
|
12
|
-
2. Add to `INSTALLED_APPS`:
|
|
13
|
-
```python
|
|
14
|
-
INSTALLED_APPS = [
|
|
15
|
-
...
|
|
16
|
-
'archive_ai',
|
|
17
|
-
]
|
|
18
|
-
```
|
|
19
|
-
|
|
20
|
-
3. Configure settings:
|
|
21
|
-
```python
|
|
22
|
-
OCR_API_URL = "http://ocr-service:8000/ocr"
|
|
23
|
-
LLM_API_URL = "http://llm-service:8000/v1/chat/completions"
|
|
24
|
-
LLM_MODEL = "your-model-name"
|
|
25
|
-
```
|
|
1
|
+
# Archive AI
|
|
2
|
+
|
|
3
|
+
A Django app that provides AI and OCR capabilities.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
1. Install via pip:
|
|
8
|
+
```bash
|
|
9
|
+
pip install archive-ai
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
2. Add to `INSTALLED_APPS`:
|
|
13
|
+
```python
|
|
14
|
+
INSTALLED_APPS = [
|
|
15
|
+
...
|
|
16
|
+
'archive_ai',
|
|
17
|
+
]
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
3. Configure settings:
|
|
21
|
+
```python
|
|
22
|
+
OCR_API_URL = "http://ocr-service:8000/ocr"
|
|
23
|
+
LLM_API_URL = "http://llm-service:8000/v1/chat/completions"
|
|
24
|
+
LLM_MODEL = "your-model-name"
|
|
25
|
+
```
|
|
@@ -1,213 +1,251 @@
|
|
|
1
|
-
# Fundemental imports
|
|
2
|
-
#####################################################################
|
|
3
|
-
from django.utils import timezone
|
|
4
|
-
from django.conf import settings
|
|
5
|
-
from django.apps import apps
|
|
6
|
-
from django.core.exceptions import FieldDoesNotExist
|
|
7
|
-
import difflib
|
|
8
|
-
import os
|
|
9
|
-
import re
|
|
10
|
-
import json
|
|
11
|
-
import requests
|
|
12
|
-
import logging
|
|
13
|
-
|
|
14
|
-
logger = logging.getLogger(__name__)
|
|
15
|
-
|
|
16
|
-
def extract_metadata_with_llm(model_name, ocr_text):
|
|
17
|
-
url = settings.LLM_API_URL
|
|
18
|
-
|
|
19
|
-
system_prompt = """
|
|
20
|
-
You are a strict JSON generator.
|
|
21
|
-
|
|
22
|
-
Analyze Arabic OCR text and extract structured metadata for one document.
|
|
23
|
-
|
|
24
|
-
Respond with EXACTLY one JSON object.
|
|
25
|
-
No explanations, no comments, no markdown, no text outside the JSON.
|
|
26
|
-
|
|
27
|
-
Current document type: {model_name}
|
|
28
|
-
|
|
29
|
-
Output format:
|
|
30
|
-
{
|
|
31
|
-
"title": "",
|
|
32
|
-
"body": "",
|
|
33
|
-
|
|
34
|
-
"document_number": "",
|
|
35
|
-
"og_document_number": "",
|
|
36
|
-
|
|
37
|
-
"document_date": "",
|
|
38
|
-
"og_document_date": "",
|
|
39
|
-
|
|
40
|
-
"source": "",
|
|
41
|
-
"destination": ""
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
Rules:
|
|
45
|
-
|
|
46
|
-
1. Document types:
|
|
47
|
-
- decree: formal decree, numbered articles, legal references, decree number, subject, footer date.
|
|
48
|
-
- incoming: original letter + received stamp (usually 2 dates, 2 numbers), source and destination, letter body.
|
|
49
|
-
- outgoing / internal / other: usually one date and one number. Source is a ministry department, destination is another entity.
|
|
50
|
-
|
|
51
|
-
2. Dates:
|
|
52
|
-
- One date → document_date
|
|
53
|
-
- Two dates → older (letter header) = og_document_date, newer (stamp) = document_date
|
|
54
|
-
|
|
55
|
-
3. Numbers:
|
|
56
|
-
- document_number:
|
|
57
|
-
- outgoing/internal/other → official document number
|
|
58
|
-
- incoming → received stamp number
|
|
59
|
-
- og_document_number:
|
|
60
|
-
- incoming only → original document number
|
|
61
|
-
- Number fields MUST always contain digits only.
|
|
62
|
-
- DO NOT use Eastern Arabic numerals (like ٠١٢٣) or any other numeral system.
|
|
63
|
-
- DO NOT include letters. If OCR included letters with the numbers, REMOVE the letters and keep the numbers.
|
|
64
|
-
|
|
65
|
-
4. Source & Destination:
|
|
66
|
-
- Extract real entity names only. Clean OCR noise but do NOT invent names.
|
|
67
|
-
- Incoming source: organization in the header.
|
|
68
|
-
- Outgoing source: department under the signer’s name.
|
|
69
|
-
- Destination often appears near the start, commonly after the words sir or sirs and a "/".
|
|
70
|
-
|
|
71
|
-
5. Title:
|
|
72
|
-
- Extract explicit subject/title if present (will always be in the beggining if present).
|
|
73
|
-
- Otherwise infer a short, accurate phrase from the document text.
|
|
74
|
-
|
|
75
|
-
6. Body:
|
|
76
|
-
- Return the full cleaned document text with OCR noise removed.
|
|
77
|
-
|
|
78
|
-
Always fill all fields.
|
|
79
|
-
If unknown or not applicable, use an empty string "".
|
|
80
|
-
|
|
81
|
-
Return ONLY the JSON object.
|
|
82
|
-
"""
|
|
83
|
-
|
|
84
|
-
user_prompt = f"""
|
|
85
|
-
OCR TEXT:
|
|
86
|
-
\"\"\"
|
|
87
|
-
{ocr_text}
|
|
88
|
-
\"\"\"
|
|
89
|
-
"""
|
|
90
|
-
|
|
91
|
-
payload = {
|
|
92
|
-
"model": "/models/" + settings.LLM_MODEL,
|
|
93
|
-
"messages": [
|
|
94
|
-
{"role": "system", "content": system_prompt},
|
|
95
|
-
{"role": "user", "content": user_prompt}
|
|
96
|
-
],
|
|
97
|
-
"temperature": 0.5,
|
|
98
|
-
"max_tokens": 1024,
|
|
99
|
-
"response_format": {"type": "json_object"},
|
|
100
|
-
"top_p": 0.8,
|
|
101
|
-
"min_p": 0,
|
|
102
|
-
"top_k": 20,
|
|
103
|
-
"presence_penalty": 0.5,
|
|
104
|
-
}
|
|
105
|
-
|
|
106
|
-
# ------------------------------------------------
|
|
107
|
-
# Call the LLM API
|
|
108
|
-
# ------------------------------------------------
|
|
109
|
-
try:
|
|
110
|
-
response = requests.post(url, json=payload, timeout=(10, 1200))
|
|
111
|
-
response.raise_for_status()
|
|
112
|
-
except requests.exceptions.RequestException:
|
|
113
|
-
raise
|
|
114
|
-
|
|
115
|
-
data = response.json()
|
|
116
|
-
|
|
117
|
-
if "choices" not in data or not data["choices"]:
|
|
118
|
-
raise ValueError("LLM API returned no choices.")
|
|
119
|
-
|
|
120
|
-
raw_content = data["choices"][0]["message"]["content"]
|
|
121
|
-
|
|
122
|
-
# ------------------------------------------------
|
|
123
|
-
# Extract JSON safely
|
|
124
|
-
# ------------------------------------------------
|
|
125
|
-
metadata = extract_json_from_text(raw_content)
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
"""
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
try:
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
1
|
+
# Fundamental imports
|
|
2
|
+
#####################################################################
|
|
3
|
+
from django.utils import timezone
|
|
4
|
+
from django.conf import settings
|
|
5
|
+
from django.apps import apps
|
|
6
|
+
from django.core.exceptions import FieldDoesNotExist
|
|
7
|
+
import difflib
|
|
8
|
+
import os
|
|
9
|
+
import re
|
|
10
|
+
import json
|
|
11
|
+
import requests
|
|
12
|
+
import logging
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
def extract_metadata_with_llm(model_name, ocr_text):
    """Ask the configured LLM to turn OCR text into a metadata dictionary.

    Posts a chat-completion request to ``settings.LLM_API_URL`` using the
    model ``"/models/" + settings.LLM_MODEL`` and parses the first choice
    as a JSON object.

    Args:
        model_name: Name of the document model being processed; it is
            inserted into the system prompt as the current document type.
        ocr_text: Raw OCR text of the document.

    Returns:
        dict: The parsed JSON object. The keys ``source``, ``destination``,
        ``document_number``, ``document_date``, ``title`` and ``body`` are
        always present (empty string when the LLM omitted them), and the
        number fields are reduced to digits by ``clean_number_field``.

    Raises:
        requests.exceptions.RequestException: On connection problems or a
            non-2xx HTTP status.
        ValueError: If the response has no choices, no textual content, or
            no parsable JSON object.
    """
    url = settings.LLM_API_URL

    # This is a plain string, not an f-string, because the prompt contains
    # literal JSON braces. The placeholder is therefore filled in explicitly
    # below; previously it reached the LLM as the literal text
    # "{model_name}".
    # NOTE(review): leading whitespace inside the prompt is reproduced as it
    # appears in the reviewed copy -- confirm against the repository file.
    system_prompt = """
You are a strict JSON generator.

Analyze Arabic OCR text and extract structured metadata for one document.

Respond with EXACTLY one JSON object.
No explanations, no comments, no markdown, no text outside the JSON.

Current document type: {model_name}

Output format:
{
"title": "",
"body": "",

"document_number": "",
"og_document_number": "",

"document_date": "",
"og_document_date": "",

"source": "",
"destination": ""
}

Rules:

1. Document types:
- decree: formal decree, numbered articles, legal references, decree number, subject, footer date.
- incoming: original letter + received stamp (usually 2 dates, 2 numbers), source and destination, letter body.
- outgoing / internal / other: usually one date and one number. Source is a ministry department, destination is another entity.

2. Dates:
- One date → document_date
- Two dates → older (letter header) = og_document_date, newer (stamp) = document_date

3. Numbers:
- document_number:
- outgoing/internal/other → official document number
- incoming → received stamp number
- og_document_number:
- incoming only → original document number
- Number fields MUST always contain digits only.
- DO NOT use Eastern Arabic numerals (like ٠١٢٣) or any other numeral system.
- DO NOT include letters. If OCR included letters with the numbers, REMOVE the letters and keep the numbers.

4. Source & Destination:
- Extract real entity names only. Clean OCR noise but do NOT invent names.
- Incoming source: organization in the header.
- Outgoing source: department under the signer’s name.
- Destination often appears near the start, commonly after the words sir or sirs and a "/".

5. Title:
- Extract explicit subject/title if present (will always be in the beggining if present).
- Otherwise infer a short, accurate phrase from the document text.

6. Body:
- Return the full cleaned document text with OCR noise removed.

Always fill all fields.
If unknown or not applicable, use an empty string "".

Return ONLY the JSON object.
""".replace("{model_name}", str(model_name))

    user_prompt = f"""
OCR TEXT:
\"\"\"
{ocr_text}
\"\"\"
"""

    payload = {
        "model": "/models/" + settings.LLM_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        "temperature": 0.5,
        "max_tokens": 1024,
        "response_format": {"type": "json_object"},
        "top_p": 0.8,
        "min_p": 0,
        "top_k": 20,
        "presence_penalty": 0.5,
    }

    # ------------------------------------------------
    # Call the LLM API
    # ------------------------------------------------
    # Request errors propagate unchanged so the caller can decide to retry.
    # timeout is (connect seconds, read seconds).
    response = requests.post(url, json=payload, timeout=(10, 1200))
    response.raise_for_status()

    data = response.json()

    if "choices" not in data or not data["choices"]:
        raise ValueError("LLM API returned no choices.")

    raw_content = data["choices"][0]["message"]["content"]
    if not isinstance(raw_content, str):
        # A null/absent content would otherwise surface as a TypeError
        # deep inside the JSON extraction.
        raise ValueError("LLM API returned no text content.")

    # ------------------------------------------------
    # Extract JSON safely
    # ------------------------------------------------
    metadata = extract_json_from_text(raw_content)

    # Ensure required keys exist
    required = ["source", "destination", "document_number", "document_date", "title", "body"]
    for key in required:
        metadata.setdefault(key, "")

    # Sanitize number fields (strict integer enforcement): the stored value
    # must be digits only, whatever the LLM wrapped around them.
    metadata["document_number"] = clean_number_field(metadata["document_number"])
    if "og_document_number" in metadata:
        metadata["og_document_number"] = clean_number_field(metadata["og_document_number"])

    return metadata
|
|
142
|
+
|
|
143
|
+
def extract_json_from_text(text):
    """Return the first JSON object embedded in an LLM response.

    Handles output where the object is wrapped in a Markdown code fence,
    preceded or followed by free text, or followed by further braces or
    objects. Parsing starts at each ``{`` in turn and stops at the end of
    the first object that decodes cleanly, so trailing content never has to
    be valid JSON.

    Args:
        text: Raw LLM response text.

    Returns:
        dict: The first decodable JSON object in ``text``.

    Raises:
        ValueError: If ``text`` is not a string, contains no ``{``, or no
            ``{`` starts a well-formed JSON object.
    """
    if not isinstance(text, str):
        raise ValueError("No JSON object found in LLM response.")

    decoder = json.JSONDecoder()
    first_error = None

    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one JSON value beginning exactly at `start`
            # and ignores whatever follows it.
            parsed, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError as exc:
            # Remember the earliest failure: it describes the brace the
            # model most likely meant as the start of its answer.
            if first_error is None:
                first_error = exc
        else:
            return parsed
        start = text.find("{", start + 1)

    if first_error is None:
        raise ValueError("No JSON object found in LLM response.")
    raise ValueError(f"Malformed JSON: {first_error}") from first_error
|
|
161
|
+
|
|
162
|
+
def validate_date(date_str):
    """Return ``date_str`` normalised to ``YYYY-MM-DD``, or ``None``.

    Args:
        date_str: Candidate date text. Surrounding whitespace is ignored and
            un-padded parts such as ``2024-1-5`` are accepted.

    Returns:
        str | None: The zero-padded ISO date, or ``None`` when the input is
        empty, not a string, or not a real calendar date in year-month-day
        order.
    """
    # Local import: uses the standard-library class directly instead of the
    # copy that happens to be importable through django.utils.timezone.
    from datetime import datetime

    if not date_str or not isinstance(date_str, str):
        return None

    try:
        parsed = datetime.strptime(date_str.strip(), "%Y-%m-%d")
    except ValueError:
        return None

    # isoformat() always yields a zero-padded YYYY-MM-DD string.
    return parsed.date().isoformat()
|
|
171
|
+
|
|
172
|
+
def _find_best_match(input_text, model_name, field_name, cutoff=60):
    """Fuzzy-match free text against rows of a related model and return an id.

    Resolves ``model_name`` inside the app named by
    ``settings.OCR_APP_LABEL`` (default ``"documents"``), follows
    ``field_name`` to its related model, and compares ``input_text`` with the
    values of that model's ``source`` / ``destination`` attribute (whichever
    exists, tried in that order).

    Args:
        input_text: Text to match, e.g. an entity name returned by the LLM.
        model_name: Name of the document model that owns ``field_name``.
        field_name: Name of the field whose related model is searched.
        cutoff: Minimum similarity. Values above 1 are read as a percentage
            (the default ``60`` means 60 %), values in ``[0, 1]`` as a
            ratio.

    Returns:
        The ``id`` of the closest row, or ``None`` when the input is empty,
        the model or field cannot be resolved, or nothing is similar enough.
    """
    if not input_text:
        return None

    text = str(input_text).strip()

    # difflib.get_close_matches() raises ValueError unless the cutoff lies
    # in [0.0, 1.0], so the percentage-style default has to be converted.
    ratio = cutoff / 100 if cutoff > 1 else cutoff
    ratio = min(max(float(ratio), 0.0), 1.0)

    app_label = getattr(settings, "OCR_APP_LABEL", "documents")

    try:
        model = apps.get_model(app_label, model_name)
    except LookupError:
        logger.error(f"Model {model_name} not found")
        return None

    try:
        relation = model._meta.get_field(field_name)
    except FieldDoesNotExist:
        logger.error(f"Field {field_name} not found in model {model_name}")
        return None

    related_model = relation.related_model
    # NOTE(review): the text column is looked up under the names "source" /
    # "destination" on the related model -- confirm this matches the schema.
    for attr in ("source", "destination"):
        if not hasattr(related_model, attr):
            continue

        # Map each distinct text value to the id of the first row carrying
        # it. Non-string values (NULLs, foreign-key ids) cannot be compared
        # by difflib and are skipped.
        id_by_value = {}
        for row_id, value in related_model.objects.values_list('id', attr):
            if isinstance(value, str):
                id_by_value.setdefault(value, row_id)

        matches = difflib.get_close_matches(text, list(id_by_value), n=1, cutoff=ratio)
        if matches:
            return id_by_value[matches[0]]

    return None
|
|
216
|
+
|
|
217
|
+
def find_closest_source(raw_text, model_name):
    """Return the id of the related "source" row closest to ``raw_text``.

    Delegates to ``_find_best_match`` with the ``source`` field and its
    default cutoff; the result is ``None`` when no row matches.
    """
    source_id = _find_best_match(raw_text, model_name, "source")
    return source_id
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def find_closest_destination(raw_text, model_name):
    """Return the id of the related "destination" row closest to ``raw_text``.

    Delegates to ``_find_best_match`` with the ``destination`` field and its
    default cutoff; the result is ``None`` when no row matches.
    """
    destination_id = _find_best_match(raw_text, model_name, "destination")
    return destination_id
|
|
223
|
+
|
|
224
|
+
def clean_number_field(value):
    """Reduce a document-number value to a string of ASCII digits.

    The value is split into alphanumeric tokens and only tokens that already
    contain a digit contribute to the result, so plain words such as
    ``"No number"`` yield ``""`` instead of being turned into digits.

    Inside a digit-bearing token, letters that OCR commonly confuses with
    digits (``O``→0, ``l``/``I``→1, ``S``→5, ``Z``→7, ``B``→8) are repaired,
    but only when every letter in the token is such a look-alike; otherwise
    the letters are treated as real text and dropped. Decimal digits from
    other scripts (e.g. Eastern Arabic ``٠١٢٣``) are converted to ASCII.

    Args:
        value: Any value; falsy values give ``""``, others are converted
            with ``str()``.

    Returns:
        str: The digits found, in order, or ``""`` if there are none.
    """
    if not value:
        return ""

    # Letters that look like digits in scanned text.
    lookalikes = {
        's': '5', 'S': '5',
        'z': '7', 'Z': '7',
        'b': '8', 'B': '8',
        'l': '1', 'I': '1', 'i': '1',
        'o': '0', 'O': '0',
    }

    digits = []
    # [^\W_]+ matches runs of letters/digits in any script.
    for token in re.findall(r"[^\W_]+", str(value)):
        if not any(ch.isdecimal() for ch in token):
            # A word without digits is text, not a misread number.
            continue

        repair_letters = all(
            ch in lookalikes for ch in token if not ch.isdecimal()
        )
        for ch in token:
            if ch.isdecimal():
                # int() understands decimal digits of every script.
                digits.append(str(int(ch)))
            elif repair_letters:
                digits.append(lookalikes[ch])

    return "".join(digits)
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
from django.apps import AppConfig
|
|
2
|
-
|
|
3
|
-
class
|
|
4
|
-
default_auto_field = 'django.db.models.BigAutoField'
|
|
5
|
-
name = '
|
|
6
|
-
|
|
7
|
-
def ready(self):
|
|
8
|
-
import
|
|
1
|
+
from django.apps import AppConfig
|
|
2
|
+
|
|
3
|
+
class ArchiveAiConfig(AppConfig):
    """Django application configuration for the ``archive_ai`` package."""

    # Dotted path of the application package.
    name = 'archive_ai'
    # Primary-key field class for models that do not declare one.
    default_auto_field = 'django.db.models.BigAutoField'

    def ready(self):
        """Load the signals module when the application is ready."""
        # Imported only for its side effect: importing the module runs its
        # @receiver decorator, which connects the post_save handler.
        import archive_ai.signals  # noqa: F401
|
|
@@ -1,29 +1,29 @@
|
|
|
1
|
-
from django.db.models.signals import post_save
|
|
2
|
-
from django.dispatch import receiver
|
|
3
|
-
from .tasks import send_pdf_to_ocr
|
|
4
|
-
|
|
5
|
-
# Defines which models should trigger the OCR.
|
|
6
|
-
# Ideally this should be configurable, but for now we follow the original list.
|
|
7
|
-
MODELS = [
|
|
8
|
-
"Decree",
|
|
9
|
-
"Incoming",
|
|
10
|
-
"Outgoing",
|
|
11
|
-
"Internal",
|
|
12
|
-
"Report",
|
|
13
|
-
"Other",
|
|
14
|
-
]
|
|
15
|
-
|
|
16
|
-
@receiver(post_save)
|
|
17
|
-
def trigger_ocr(sender, instance, created, **kwargs):
|
|
18
|
-
if not created:
|
|
19
|
-
return
|
|
20
|
-
if sender.__name__ not in MODELS:
|
|
21
|
-
return
|
|
22
|
-
|
|
23
|
-
# All your models have a 'pdf' field but if the name differs,
|
|
24
|
-
# adjust here.
|
|
25
|
-
send_pdf_to_ocr.delay(
|
|
26
|
-
model_name=sender.__name__,
|
|
27
|
-
obj_id=instance.id,
|
|
28
|
-
pdf_field_name="pdf_file" # change if your field name differs
|
|
29
|
-
)
|
|
1
|
+
from django.db.models.signals import post_save
|
|
2
|
+
from django.dispatch import receiver
|
|
3
|
+
from .tasks import send_pdf_to_ocr
|
|
4
|
+
|
|
5
|
+
# Defines which models should trigger the OCR.
|
|
6
|
+
# Ideally this should be configurable, but for now we follow the original list.
|
|
7
|
+
MODELS = [
    "Decree",
    "Incoming",
    "Outgoing",
    "Internal",
    "Report",
    "Other",
]


@receiver(post_save)
def trigger_ocr(sender, instance, created, **kwargs):
    """Queue the OCR task when a new document row is created.

    Connected to ``post_save`` for every model. It acts only when the row
    was just created, the model's class name is listed in ``MODELS``, and the
    model belongs to the app named by ``settings.OCR_APP_LABEL`` (default
    ``"documents"``), which is where the task later looks the model up.

    Args:
        sender: Model class that was saved.
        instance: The saved model instance.
        created: ``True`` when the save inserted a new row.
        **kwargs: Remaining ``post_save`` arguments; ``using`` (the database
            alias) is read if present.
    """
    # Function-scope imports keep this handler self-contained.
    from django.conf import settings
    from django.db import transaction

    if not created or sender.__name__ not in MODELS:
        return

    # A same-named model from another app would make the task load the
    # wrong model, because the task resolves names under OCR_APP_LABEL.
    if sender._meta.app_label != getattr(settings, "OCR_APP_LABEL", "documents"):
        return

    model_name = sender.__name__
    obj_id = instance.id

    def enqueue():
        send_pdf_to_ocr.delay(
            model_name=model_name,
            obj_id=obj_id,
            pdf_field_name="pdf_file"  # change if your field name differs
        )

    # Defer until the surrounding transaction commits so the worker cannot
    # query for the row before it is visible. Outside a transaction
    # on_commit() calls the function immediately.
    transaction.on_commit(enqueue, using=kwargs.get("using"))
|
|
@@ -1,111 +1,111 @@
|
|
|
1
|
-
from celery import shared_task
|
|
2
|
-
from django.apps import apps
|
|
3
|
-
from django.conf import settings
|
|
4
|
-
import requests
|
|
5
|
-
import os
|
|
6
|
-
import logging
|
|
7
|
-
from .ai import extract_metadata_with_llm, validate_date, find_closest_source, find_closest_destination
|
|
8
|
-
|
|
9
|
-
logger = logging.getLogger(__name__)
|
|
10
|
-
|
|
11
|
-
@shared_task(bind=True, max_retries=3, default_retry_delay=120, soft_time_limit=800, time_limit=1200)
|
|
12
|
-
def send_pdf_to_ocr(self, model_name, obj_id, pdf_field_name):
|
|
13
|
-
"""
|
|
14
|
-
Uploads a PDF to the OCR API, processes text through LLM metadata extractor,
|
|
15
|
-
and updates the related Django model.
|
|
16
|
-
|
|
17
|
-
Retries ONLY for network-related issues.
|
|
18
|
-
"""
|
|
19
|
-
|
|
20
|
-
app_label = getattr(settings, "OCR_APP_LABEL", "documents")
|
|
21
|
-
Model = apps.get_model(app_label, model_name)
|
|
22
|
-
|
|
23
|
-
try:
|
|
24
|
-
obj = Model.objects.get(id=obj_id)
|
|
25
|
-
except Model.DoesNotExist:
|
|
26
|
-
logger.error(f"[OCR TASK] Object {obj_id} of type {model_name} does not exist.")
|
|
27
|
-
return False
|
|
28
|
-
|
|
29
|
-
pdf_field = getattr(obj, pdf_field_name)
|
|
30
|
-
pdf_path = pdf_field.path
|
|
31
|
-
|
|
32
|
-
# ------------------------------------------------
|
|
33
|
-
# Ensure PDF exists
|
|
34
|
-
# ------------------------------------------------
|
|
35
|
-
if not os.path.exists(pdf_path):
|
|
36
|
-
logger.error(f"[OCR TASK] PDF file missing: {pdf_path}")
|
|
37
|
-
return False
|
|
38
|
-
|
|
39
|
-
# ------------------------------------------------
|
|
40
|
-
# 1. Send to OCR API
|
|
41
|
-
# ------------------------------------------------
|
|
42
|
-
try:
|
|
43
|
-
with open(pdf_path, "rb") as f:
|
|
44
|
-
response = requests.post(
|
|
45
|
-
settings.OCR_API_URL,
|
|
46
|
-
files={"file": f},
|
|
47
|
-
data={"rotation": 0, "mode": "Markdown"},
|
|
48
|
-
timeout=1200,
|
|
49
|
-
)
|
|
50
|
-
response.raise_for_status()
|
|
51
|
-
except requests.exceptions.RequestException as exc:
|
|
52
|
-
# Retry only real network failures
|
|
53
|
-
logger.warning(f"[OCR TASK] Network error, retrying: {exc}")
|
|
54
|
-
raise self.retry(exc=exc)
|
|
55
|
-
|
|
56
|
-
try:
|
|
57
|
-
ocr_text = response.json().get("text", "")
|
|
58
|
-
except Exception as exc:
|
|
59
|
-
logger.error(f"[OCR TASK] Invalid OCR response JSON: {exc}")
|
|
60
|
-
return False
|
|
61
|
-
|
|
62
|
-
# ------------------------------------------------
|
|
63
|
-
# 2. Extract metadata using LLM
|
|
64
|
-
# ------------------------------------------------
|
|
65
|
-
try:
|
|
66
|
-
metadata = extract_metadata_with_llm(model_name, ocr_text)
|
|
67
|
-
except requests.exceptions.RequestException as exc:
|
|
68
|
-
logger.warning(f"[OCR TASK] LLM network failure, retrying: {exc}")
|
|
69
|
-
raise self.retry(exc=exc)
|
|
70
|
-
except Exception as exc:
|
|
71
|
-
logger.error(f"[OCR TASK] Metadata parsing failed: {exc}")
|
|
72
|
-
return False
|
|
73
|
-
|
|
74
|
-
# ------------------------------------------------
|
|
75
|
-
# 3. Apply metadata safely
|
|
76
|
-
# ------------------------------------------------
|
|
77
|
-
try:
|
|
78
|
-
# Universal fields
|
|
79
|
-
obj.title = metadata.get("title") or obj.title
|
|
80
|
-
obj.text = metadata.get("body", obj.text)
|
|
81
|
-
|
|
82
|
-
# Set number depending on doc type
|
|
83
|
-
if metadata.get("doc_type") == "incoming":
|
|
84
|
-
obj.number = metadata.get("og_document_number") or obj.number
|
|
85
|
-
obj.received_number = metadata.get("document_number") or obj.received_number
|
|
86
|
-
else:
|
|
87
|
-
obj.number = metadata.get("document_number") or obj.number
|
|
88
|
-
|
|
89
|
-
# Dates
|
|
90
|
-
doc_date = validate_date(metadata.get("document_date", ""))
|
|
91
|
-
og_date = validate_date(metadata.get("og_document_date", ""))
|
|
92
|
-
|
|
93
|
-
if metadata.get("doc_type") == "incoming":
|
|
94
|
-
if og_date:
|
|
95
|
-
obj.og_date = og_date
|
|
96
|
-
else:
|
|
97
|
-
if doc_date:
|
|
98
|
-
obj.date = doc_date
|
|
99
|
-
|
|
100
|
-
# Source / Destination logic if needed
|
|
101
|
-
# (This part was commented out in original, keeping it that way or enabling based on user needs.
|
|
102
|
-
# I'll keep it commented out to match original behavior, but ensure imports are available if they uncomment)
|
|
103
|
-
|
|
104
|
-
obj.save()
|
|
105
|
-
|
|
106
|
-
except Exception as exc:
|
|
107
|
-
logger.error(f"[OCR TASK] Failed to update object {obj_id}: {exc}")
|
|
108
|
-
return False
|
|
109
|
-
|
|
110
|
-
logger.info(f"[OCR TASK] Completed successfully for obj {obj_id}")
|
|
111
|
-
return True
|
|
1
|
+
from celery import shared_task
|
|
2
|
+
from django.apps import apps
|
|
3
|
+
from django.conf import settings
|
|
4
|
+
import requests
|
|
5
|
+
import os
|
|
6
|
+
import logging
|
|
7
|
+
from .ai import extract_metadata_with_llm, validate_date, find_closest_source, find_closest_destination
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
# NOTE(review): soft_time_limit (800 s) is shorter than the 1200 s HTTP
# timeouts used below, so the soft limit can fire mid-request -- confirm
# this is intended.
@shared_task(bind=True, max_retries=3, default_retry_delay=120, soft_time_limit=800, time_limit=1200)
def send_pdf_to_ocr(self, model_name, obj_id, pdf_field_name):
    """Run OCR and LLM metadata extraction for one document and save the result.

    Uploads the object's PDF to ``settings.OCR_API_URL``, passes the
    returned text to ``extract_metadata_with_llm`` and copies the extracted
    values onto the object.

    Retries ONLY for network-related issues (``requests`` exceptions from
    the OCR or LLM call); every other failure is logged and ends the task.

    Args:
        model_name: Name of the model class, resolved inside the app named
            by ``settings.OCR_APP_LABEL`` (default ``"documents"``).
        obj_id: Primary key of the object to process.
        pdf_field_name: Name of the file field holding the PDF.

    Returns:
        bool: ``True`` when the object was updated, ``False`` otherwise.
    """
    app_label = getattr(settings, "OCR_APP_LABEL", "documents")
    try:
        Model = apps.get_model(app_label, model_name)
    except LookupError:
        logger.error(f"[OCR TASK] Model {app_label}.{model_name} is not installed.")
        return False

    try:
        obj = Model.objects.get(id=obj_id)
    except Model.DoesNotExist:
        logger.error(f"[OCR TASK] Object {obj_id} of type {model_name} does not exist.")
        return False

    pdf_field = getattr(obj, pdf_field_name, None)
    if not pdf_field:
        # Missing attribute or a file field with no file attached; reading
        # .path on an empty file field would raise.
        logger.error(f"[OCR TASK] Object {obj_id} has no file in field '{pdf_field_name}'.")
        return False
    pdf_path = pdf_field.path

    # ------------------------------------------------
    # Ensure PDF exists
    # ------------------------------------------------
    if not os.path.exists(pdf_path):
        logger.error(f"[OCR TASK] PDF file missing: {pdf_path}")
        return False

    # ------------------------------------------------
    # 1. Send to OCR API
    # ------------------------------------------------
    try:
        with open(pdf_path, "rb") as f:
            response = requests.post(
                settings.OCR_API_URL,
                files={"file": f},
                data={"rotation": 0, "mode": "Markdown"},
                timeout=1200,
            )
        response.raise_for_status()
    except requests.exceptions.RequestException as exc:
        # Retry only real network failures
        logger.warning(f"[OCR TASK] Network error, retrying: {exc}")
        raise self.retry(exc=exc)

    try:
        ocr_text = response.json().get("text", "")
    except Exception as exc:
        logger.error(f"[OCR TASK] Invalid OCR response JSON: {exc}")
        return False

    # ------------------------------------------------
    # 2. Extract metadata using LLM
    # ------------------------------------------------
    try:
        metadata = extract_metadata_with_llm(model_name, ocr_text)
    except requests.exceptions.RequestException as exc:
        logger.warning(f"[OCR TASK] LLM network failure, retrying: {exc}")
        raise self.retry(exc=exc)
    except Exception as exc:
        logger.error(f"[OCR TASK] Metadata parsing failed: {exc}")
        return False

    # ------------------------------------------------
    # 3. Apply metadata safely
    # ------------------------------------------------
    # The LLM output format has no "doc_type" key, so the document type is
    # taken from the model name; an explicit "doc_type" is still honoured.
    is_incoming = (
        str(model_name).lower() == "incoming"
        or metadata.get("doc_type") == "incoming"
    )

    try:
        # Universal fields: keep the stored value when the LLM returned
        # nothing, so an empty result never erases existing data.
        obj.title = metadata.get("title") or obj.title
        obj.text = metadata.get("body") or obj.text

        # Set number depending on doc type
        if is_incoming:
            obj.number = metadata.get("og_document_number") or obj.number
            obj.received_number = metadata.get("document_number") or obj.received_number
        else:
            obj.number = metadata.get("document_number") or obj.number

        # Dates
        doc_date = validate_date(metadata.get("document_date", ""))
        og_date = validate_date(metadata.get("og_document_date", ""))

        # document_date is applied for every type, as before; incoming
        # documents additionally record the original letter date.
        # NOTE(review): assumes Incoming exposes og_date / received_number
        # -- confirm against the models.
        if doc_date:
            obj.date = doc_date
        if is_incoming and og_date:
            obj.og_date = og_date

        # Source / destination matching is intentionally not applied here;
        # find_closest_source / find_closest_destination are imported by
        # this module for that purpose.

        obj.save()

    except Exception as exc:
        logger.error(f"[OCR TASK] Failed to update object {obj_id}: {exc}")
        return False

    logger.info(f"[OCR TASK] Completed successfully for obj {obj_id}")
    return True
|
|
@@ -1,29 +1,29 @@
|
|
|
1
|
-
from setuptools import setup, find_packages
|
|
2
|
-
|
|
3
|
-
setup(
|
|
4
|
-
name="archive-ai",
|
|
5
|
-
version="1.0.
|
|
6
|
-
packages=find_packages(),
|
|
7
|
-
include_package_data=True,
|
|
8
|
-
license="MIT",
|
|
9
|
-
description="A Django app for AI and OCR logic.",
|
|
10
|
-
long_description=open("README.md").read(),
|
|
11
|
-
long_description_content_type="text/markdown",
|
|
12
|
-
url="https://github.com/debeski1/archive-ai-app",
|
|
13
|
-
author="Debeski",
|
|
14
|
-
author_email="debeski1@gmail.com",
|
|
15
|
-
classifiers=[
|
|
16
|
-
"Environment :: Web Environment",
|
|
17
|
-
"Framework :: Django",
|
|
18
|
-
"Intended Audience :: Developers",
|
|
19
|
-
"License :: OSI Approved :: MIT License",
|
|
20
|
-
"Operating System :: OS Independent",
|
|
21
|
-
"Programming Language :: Python",
|
|
22
|
-
"Programming Language :: Python :: 3",
|
|
23
|
-
],
|
|
24
|
-
install_requires=[
|
|
25
|
-
"Django>=3.2",
|
|
26
|
-
"requests",
|
|
27
|
-
"celery",
|
|
28
|
-
],
|
|
29
|
-
)
|
|
1
|
+
"""Packaging script for the archive-ai Django app."""
from pathlib import Path

from setuptools import setup, find_packages

# Read the README relative to this file rather than the current directory,
# decode it explicitly as UTF-8 (the platform default encoding is not
# reliable), and let read_text() close the file handle.
long_description = (Path(__file__).resolve().parent / "README.md").read_text(
    encoding="utf-8"
)

setup(
    name="archive-ai",
    version="1.0.2",
    packages=find_packages(),
    # Ship the non-Python files selected by MANIFEST.in.
    include_package_data=True,
    license="MIT",
    description="A Django app for AI and OCR logic.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/debeski1/archive-ai-app",
    author="Debeski",
    author_email="debeski1@gmail.com",
    classifiers=[
        "Environment :: Web Environment",
        "Framework :: Django",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
    ],
    install_requires=[
        "Django>=3.2",
        "requests",
        "celery",
    ],
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|