readgrid 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- readgrid/__init__.py +17 -0
- readgrid/pipeline.py +877 -0
- readgrid-0.1.0.dist-info/METADATA +113 -0
- readgrid-0.1.0.dist-info/RECORD +7 -0
- readgrid-0.1.0.dist-info/WHEEL +5 -0
- readgrid-0.1.0.dist-info/licenses/LICENSE +21 -0
- readgrid-0.1.0.dist-info/top_level.txt +1 -0
readgrid/__init__.py
ADDED
@@ -0,0 +1,17 @@
from .pipeline import (
    stage_1,
    stage_2,
    stage_3,
    cleanup_pipeline,
    pretty_print_page_with_image,
    show_comparison_view
)

__all__ = [
    "stage_1",
    "stage_2",
    "stage_3",
    "cleanup_pipeline",
    "pretty_print_page_with_image",
    "show_comparison_view",
]
readgrid/pipeline.py
ADDED
@@ -0,0 +1,877 @@
# ==================== IMPORTS ====================
import cv2
import numpy as np
import json
import os
import re
import base64
import time
import shutil
import textwrap
from io import BytesIO
from PIL import Image
from getpass import getpass
from typing import List, Tuple, Dict, Any, Optional

# Imports for Google Colab
from google.colab import files
from google.colab.patches import cv2_imshow
from google.colab import output
from IPython.display import display, Image as IPImage, clear_output, HTML

# Imports for Stage 3 (LLM)
try:
    import google.generativeai as genai
except ImportError:
    print("Warning: 'google-generativeai' not found. Stage 3 will not be available.")
    print("Please run: !pip install -q google-generativeai")

# ==================== UTILITY FUNCTIONS ====================
def cleanup_pipeline():
    """Removes all generated files and folders from the pipeline."""
    print("🧹 Cleaning up pipeline artifacts...")
    items_to_remove = [
        'uploads',
        'bounded_images',
        'final_outputs',
        'coords.json'
    ]
    for item in items_to_remove:
        try:
            if os.path.exists(item):
                if os.path.isdir(item):
                    shutil.rmtree(item)
                    print(f" - Removed directory: {item}/")
                else:
                    os.remove(item)
                    print(f" - Removed file: {item}")
        except Exception as e:
            print(f" - Error removing {item}: {e}")
    print("✅ Cleanup complete.")

def pretty_print_page_with_image(json_path: str):
    """
    Pretty prints the content of a final JSON file and displays its
    corresponding annotated image.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"❌ Error: File '{json_path}' not found.")
        return

    row_id = os.path.splitext(os.path.basename(json_path))[0]
    print("=" * 100)
    print(f"📄 DOCUMENT PREVIEW: {row_id}")
    print("=" * 100)

    header = data.get("Page header", "") or "(none)"
    page_text = data.get("Page text", "") or "(none)"
    footer = data.get("Page footer", "") or "(none)"

    print(f"📌 HEADER:\n---\n{textwrap.fill(header, 100)}\n")
    print(f"📖 PAGE TEXT:\n---\n{textwrap.fill(page_text, 100)}")
    print(f"\n📌 FOOTER:\n---\n{textwrap.fill(footer, 100)}\n")

    table_bbox = data.get("table_bbox", [])
    image_bbox = data.get("image_bbox", [])

    print("🟥 TABLE BBOX ([ymin, xmin, ymax, xmax]):")
    print("---" if table_bbox else "(none)")
    if table_bbox:
        for i, bbox in enumerate(table_bbox, 1):
            print(f" Table {i}: {bbox}")

    print("\n🟩 IMAGE BBOX ([ymin, xmin, ymax, xmax]):")
    print("---" if image_bbox else "(none)")
    if image_bbox:
        for i, bbox in enumerate(image_bbox, 1):
            print(f" Image {i}: {bbox}")

    img_path = os.path.join('bounded_images', f"{row_id}.jpg")
    if os.path.exists(img_path):
        print("\n📸 CORRESPONDING ANNOTATED IMAGE:")
        cv2_imshow(cv2.imread(img_path))
    else:
        print(f"\n⚠️ Annotated image not found at: {img_path}")
    print("=" * 100)

def show_comparison_view(json_path: str):
    """
    Renders a side-by-side HTML view of the original page image and the
    reconstructed page content from its final JSON file.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"❌ Error: File '{json_path}' not found.")
        return

    row_id = os.path.splitext(os.path.basename(json_path))[0]
    img_path = os.path.join('bounded_images', f"{row_id}.jpg")

    if not os.path.exists(img_path):
        print(f"❌ Error: Image file not found at '{img_path}'")
        return

    image = cv2.imread(img_path)
    _, buffer = cv2.imencode('.jpg', image)
    base64_image = base64.b64encode(buffer).decode('utf-8')

    header = data.get("Page header", "")
    page_text = data.get("Page text", "").replace('\n', '<br>')
    footer = data.get("Page footer", "")

    html_content = f"""
    <div style="display: flex; gap: 20px; font-family: sans-serif;">
      <div style="flex: 1; border: 1px solid #ddd; padding: 10px;">
        <h3 style="text-align: center;">Annotated Page Image</h3>
        <img src="data:image/jpeg;base64,{base64_image}" style="width: 100%;">
      </div>
      <div style="flex: 1; border: 1px solid #ddd; padding: 10px;">
        <h3 style="text-align: center;">Reconstructed Page Preview</h3>
        <div style="background: #f5f5f5; padding: 10px; margin-bottom: 10px; border-radius: 4px;"><b>Header:</b> {header}</div>
        <div style="line-height: 1.6;">{page_text}</div>
        <div style="background: #f5f5f5; padding: 10px; margin-top: 10px; border-radius: 4px; font-size: 0.9em;"><b>Footer:</b> {footer}</div>
      </div>
    </div>
    """
    display(HTML(html_content))

# ==================== HELPER & EDITOR FUNCTIONS ====================

def xywh_to_yminmax(box: tuple) -> List[int]:
    """Converts (x, y, w, h) to [ymin, xmin, ymax, xmax]."""
    x, y, w, h = box
    return [y, x, y + h, x + w]

def yminmax_to_xywh(box: list) -> List[int]:
    """Converts [ymin, xmin, ymax, xmax] to [x, y, w, h]."""
    ymin, xmin, ymax, xmax = box
    return [xmin, ymin, xmax - xmin, ymax - ymin]
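
# Example with made-up numbers: a box at x=10, y=20 with w=30, h=40
# round-trips between the two formats:
#   xywh_to_yminmax((10, 20, 30, 40))  -> [20, 10, 60, 40]
#   yminmax_to_xywh([20, 10, 60, 40])  -> [10, 20, 30, 40]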

def detect_tables(image: np.ndarray) -> List[List[int]]:
    """Detects tables in an image. Returns xywh format."""
    boxes = []
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    binary = cv2.adaptiveThreshold(~gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 15, -2)
    h_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
    v_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
    h_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, h_kernel, iterations=2)
    v_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, v_kernel, iterations=2)
    mask = cv2.add(h_lines, v_lines)
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    for c in contours:
        if cv2.contourArea(c) > 2000:
            x, y, w, h = cv2.boundingRect(c)
            if w > 50 and h > 50:
                boxes.append([x, y, w, h])
    return boxes

def detect_image_regions(image: np.ndarray, min_area_percentage=1.5) -> List[List[int]]:
    """Detects image regions. Returns xywh format."""
    h, w, _ = image.shape
    min_area = (min_area_percentage / 100) * (h * w)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    edged = cv2.Canny(cv2.GaussianBlur(gray, (5, 5), 0), 100, 200)
    contours, _ = cv2.findContours(cv2.dilate(edged, None, iterations=2), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    boxes = []
    for c in contours:
        if cv2.contourArea(c) > min_area:
            x, y, w_box, h_box = cv2.boundingRect(c)
            if 0.2 < (w_box / float(h_box) if h_box > 0 else 0) < 5.0 and w_box > 80 and h_box > 80:
                boxes.append([x, y, w_box, h_box])
    return boxes

def create_annotated_image(
    image: np.ndarray,
    table_boxes: List[List[int]],
    image_boxes: List[List[int]]
) -> np.ndarray:
    """Creates annotated image with table and image bounding boxes."""
    annotated_img = image.copy()

    # Draw table boxes (red)
    for i, box in enumerate(table_boxes):
        x, y, w, h = box
        cv2.rectangle(annotated_img, (x, y), (x + w, y + h), (0, 0, 255), 3)
        cv2.putText(annotated_img, f"Table {i+1}", (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 0, 255), 2)

    # Draw image boxes (green)
    for i, box in enumerate(image_boxes):
        x, y, w, h = box
        cv2.rectangle(annotated_img, (x, y), (x + w, y + h), (0, 255, 0), 3)
        cv2.putText(annotated_img, f"Image {i+1}", (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

    return annotated_img

def create_context_image(
    image: np.ndarray,
    context_table_boxes: List[Tuple[List[int], int]],  # (box, original_index)
    context_image_boxes: List[Tuple[List[int], int]]   # (box, original_index)
) -> np.ndarray:
    """Creates image with context boxes (all boxes except the one being edited)."""
    context_img = image.copy()

    # Draw context table boxes (red)
    for box, original_idx in context_table_boxes:
        x, y, w, h = box
        cv2.rectangle(context_img, (x, y), (x + w, y + h), (0, 0, 255), 2)
        cv2.putText(context_img, f"Table {original_idx + 1}", (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)

    # Draw context image boxes (green)
    for box, original_idx in context_image_boxes:
        x, y, w, h = box
        cv2.rectangle(context_img, (x, y), (x + w, y + h), (0, 255, 0), 2)
        cv2.putText(context_img, f"Image {original_idx + 1}", (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

    return context_img

def interactive_editor(img: np.ndarray, initial_box: List[int], editor_title: str) -> Optional[List[int]]:
    """Launches the HTML/JS editor for editing a single bounding box."""
    _, buffer = cv2.imencode('.png', img)
    img_str = base64.b64encode(buffer).decode('utf-8')
    img_data_url = f'data:image/png;base64,{img_str}'

    # Convert single box to list format for the editor
    initial_boxes = [initial_box] if initial_box else []
    boxes_json = json.dumps(initial_boxes)

    html_template = f"""
    <div style="border: 2px solid #ccc; padding: 10px; display: inline-block;">
      <h3 style="font-family: sans-serif;">{editor_title}</h3>
      <p style="font-family: sans-serif; margin-top: 0;">
        <b>Click and drag to draw a box.</b> | <b>Click an existing box to delete.</b>
      </p>
      <canvas id="editor-canvas" style="cursor: crosshair; border: 1px solid black;"></canvas>
      <br>
      <button id="done-button" style="margin-top: 10px; font-size: 16px; padding: 8px 16px;">✅ Submit Box</button>
      <div id="status" style="margin-top: 10px; font-family: sans-serif; font-size: 14px;"></div>
    </div>
    <script>
      const canvas = document.getElementById('editor-canvas');
      const ctx = canvas.getContext('2d');
      const doneButton = document.getElementById('done-button');
      const status = document.getElementById('status');
      const img = new Image();

      window.finished = false;
      window.finalBoxes = [];
      let boxes = JSON.parse('{boxes_json}');
      let isDrawing = false;
      let startX, startY;

      function updateStatus(message) {{ status.textContent = message; }}

      img.onload = function() {{
        canvas.width = img.width;
        canvas.height = img.height;
        redraw();
        updateStatus('Image loaded. Ready for editing.');
      }};
      img.src = '{img_data_url}';

      function redraw() {{
        ctx.clearRect(0, 0, canvas.width, canvas.height);
        ctx.drawImage(img, 0, 0);
        ctx.strokeStyle = 'blue';
        ctx.lineWidth = 2;
        boxes.forEach(([x, y, w, h]) => {{ ctx.strokeRect(x, y, w, h); }});
        updateStatus(`Current boxes: ${{boxes.length}}`);
      }}

      canvas.addEventListener('mousedown', (e) => {{
        const rect = canvas.getBoundingClientRect();
        const mouseX = e.clientX - rect.left;
        const mouseY = e.clientY - rect.top;
        let boxClicked = -1;
        for (let i = boxes.length - 1; i >= 0; i--) {{
          const [x, y, w, h] = boxes[i];
          if (mouseX >= x && mouseX <= x + w && mouseY >= y && mouseY <= y + h) {{
            boxClicked = i;
            break;
          }}
        }}
        if (boxClicked !== -1) {{
          boxes.splice(boxClicked, 1);
          redraw();
          updateStatus('Box deleted.');
        }} else {{
          isDrawing = true;
          startX = mouseX;
          startY = mouseY;
          updateStatus('Drawing new box...');
        }}
      }});

      canvas.addEventListener('mousemove', (e) => {{
        if (!isDrawing) return;
        const rect = canvas.getBoundingClientRect();
        const mouseX = e.clientX - rect.left;
        const mouseY = e.clientY - rect.top;
        redraw();
        ctx.strokeStyle = 'red';
        ctx.strokeRect(startX, startY, mouseX - startX, mouseY - startY);
      }});

      canvas.addEventListener('mouseup', (e) => {{
        if (!isDrawing) return;
        isDrawing = false;
        const rect = canvas.getBoundingClientRect();
        const mouseX = e.clientX - rect.left;
        const mouseY = e.clientY - rect.top;
        const x = Math.min(startX, mouseX);
        const y = Math.min(startY, mouseY);
        const w = Math.abs(mouseX - startX);
        const h = Math.abs(mouseY - startY);
        if (w > 5 && h > 5) {{
          boxes.push([Math.round(x), Math.round(y), Math.round(w), Math.round(h)]);
        }}
        redraw();
      }});

      doneButton.addEventListener('click', () => {{
        doneButton.textContent = '⏳ Submitting...';
        doneButton.disabled = true;
        canvas.style.cursor = 'default';
        window.finalBoxes = boxes;
        window.finished = true;
        updateStatus('✅ Submitted! Python is now processing...');
      }});
    </script>
    """

    display(HTML(html_template))
    print(f"\n✏️ Edit the {editor_title.lower()} above. Click 'Submit Box' when done.")
    print("Waiting for manual correction... ⏳")

    final_boxes = None
    for _ in range(600):  # Wait for up to 5 minutes
        try:
            is_done = output.eval_js('window.finished')
            if is_done:
                final_boxes = output.eval_js('window.finalBoxes')
                break
        except Exception:
            pass
        time.sleep(0.5)

    clear_output(wait=True)
    if final_boxes is not None and len(final_boxes) > 0:
        print("✅ Manual corrections received!")
        return final_boxes[0]  # Return the first (and should be the only) box
    else:
        print("⚠️ No box submitted. Using original box." if initial_box else "⚠️ No box submitted. Box will be removed.")
        return initial_box if initial_box else None

# ==================== STAGE 1: UPLOAD, DETECT, & EDIT ====================

def stage_1():
    """
    Handles document upload, detection, and interactive editing (single pass, no loop).
    """
    print("=" * 60 + "\nSTAGE 1: UPLOAD, DETECT, AND EDIT\n" + "=" * 60)

    # Create directories
    for folder in ['uploads', 'bounded_images']:
        os.makedirs(folder, exist_ok=True)

    # Upload file
    print("\n📤 Please upload your document image...")
    uploaded = files.upload()
    if not uploaded:
        print("❌ No files uploaded.")
        return

    # Initial setup
    filename = list(uploaded.keys())[0]
    filepath = os.path.join('uploads', filename)
    with open(filepath, 'wb') as f:
        f.write(uploaded[filename])

    row_id = input(f"➡️ Enter a unique Row ID for '{filename}' (e.g., ID_1): ").strip() or os.path.splitext(filename)[0]
    original_img = cv2.imread(filepath)

    # Resize for consistent display
    MAX_WIDTH = 1200
    original_h, original_w, _ = original_img.shape
    scale = MAX_WIDTH / original_w if original_w > MAX_WIDTH else 1.0
    display_w = int(original_w * scale)
    display_h = int(original_h * scale)
    display_img = cv2.resize(original_img, (display_w, display_h), interpolation=cv2.INTER_AREA)

    print("\n" + "=" * 50 + f"\nProcessing: {filename} (Row ID: {row_id})\n" + "=" * 50)
    print("🤖 Running automatic detection...")

    # Detect on original image, then scale for display
    table_coords_xywh = detect_tables(original_img)
    image_coords_xywh = detect_image_regions(original_img)

    # Scale coordinates for display
    table_coords_display = [[int(x * scale), int(y * scale), int(w * scale), int(h * scale)]
                            for x, y, w, h in table_coords_xywh]
    image_coords_display = [[int(x * scale), int(y * scale), int(w * scale), int(h * scale)]
                            for x, y, w, h in image_coords_xywh]

    print(f"✅ Found {len(table_coords_xywh)} tables and {len(image_coords_xywh)} images.")

    # Show initial detection results
    current_annotated_img = create_annotated_image(display_img, table_coords_display, image_coords_display)
    print("\n📸 Detection Results (Original vs Annotated):")
    side_by_side = np.hstack((display_img, current_annotated_img))
    cv2_imshow(side_by_side)

    # Ask if user wants to edit anything
    prompt = "\n❓ Are you satisfied with these detections?\n"

    if table_coords_display:
        if len(table_coords_display) == 1:
            prompt += " - To edit the table, type 'table'\n"
        else:
            prompt += f" - To edit tables, type 'table 1' to 'table {len(table_coords_display)}'\n"

    if image_coords_display:
        if len(image_coords_display) == 1:
            prompt += " - To edit the image, type 'image'\n"
        else:
            prompt += f" - To edit images, type 'image 1' to 'image {len(image_coords_display)}'\n"

    prompt += " - Type 'yes' to approve all and finish\nYour choice: "

    choice = input(prompt).strip().lower()

    if choice == 'yes':
        print("✅ All annotations approved.")
    else:
        # Parse and handle editing request
        try:
            if choice in ['table', 'image']:
                # Single box case
                if choice == 'table' and len(table_coords_display) == 1:
                    box_type = 'table'
                    box_index = 0
                elif choice == 'image' and len(image_coords_display) == 1:
                    box_type = 'image'
                    box_index = 0
                else:
                    print(f"❌ Multiple {choice}s detected. Please specify which one (e.g., '{choice} 1').")
                    return
            else:
                # Parse "table 1", "image 2", etc.
                parts = choice.split()
                if len(parts) != 2:
                    print("❌ Invalid format. Please specify which item to edit.")
                    return

                box_type = parts[0]
                box_index = int(parts[1]) - 1

                if box_type not in ['table', 'image']:
                    print("❌ Invalid type. Use 'table' or 'image'.")
                    return

                # Validate index
                if box_type == 'table':
                    if not (0 <= box_index < len(table_coords_display)):
                        print(f"❌ Table {box_index + 1} doesn't exist.")
                        return
                else:  # image
                    if not (0 <= box_index < len(image_coords_display)):
                        print(f"❌ Image {box_index + 1} doesn't exist.")
                        return

        except (ValueError, IndexError):
            print("❌ Invalid input. Please enter a valid choice.")
            return

        # Perform the editing
        if box_type == 'table':
            # Get the box being edited
            box_to_edit = table_coords_display[box_index]

            # Create context: all images + all other tables (with original indices)
            context_table_boxes = [(box, i) for i, box in enumerate(table_coords_display) if i != box_index]
            context_image_boxes = [(box, i) for i, box in enumerate(image_coords_display)]

            # Create context image
            context_img = create_context_image(display_img, context_table_boxes, context_image_boxes)

            # Edit the specific table box
            print(f"\n✏️ Editing Table {box_index + 1}...")
            corrected_box = interactive_editor(context_img, box_to_edit, f"Table {box_index + 1} Editor")

            # Update the specific box
            if corrected_box:
                table_coords_display[box_index] = corrected_box
                # Scale back to original coordinates
                table_coords_xywh[box_index] = [int(v / scale) for v in corrected_box]
            else:
                # Remove the box if None returned
                del table_coords_display[box_index]
                del table_coords_xywh[box_index]

        else:  # image
            # Get the box being edited
            box_to_edit = image_coords_display[box_index]

            # Create context: all tables + all other images (with original indices)
            context_table_boxes = [(box, i) for i, box in enumerate(table_coords_display)]
            context_image_boxes = [(box, i) for i, box in enumerate(image_coords_display) if i != box_index]

            # Create context image
            context_img = create_context_image(display_img, context_table_boxes, context_image_boxes)

            # Edit the specific image box
            print(f"\n✏️ Editing Image {box_index + 1}...")
            corrected_box = interactive_editor(context_img, box_to_edit, f"Image {box_index + 1} Editor")

            # Update the specific box
            if corrected_box:
                image_coords_display[box_index] = corrected_box
                # Scale back to original coordinates
                image_coords_xywh[box_index] = [int(v / scale) for v in corrected_box]
            else:
                # Remove the box if None returned
                del image_coords_display[box_index]
                del image_coords_xywh[box_index]

    # Show final result: clean original vs updated result
    final_annotated = create_annotated_image(display_img, table_coords_display, image_coords_display)
    print("\n🔍 Final Result (Original Clean vs Updated):")
    comparison = np.hstack((display_img, final_annotated))
    cv2_imshow(comparison)

    # Convert to yminmax format for final output
    table_coords_yminmax = [xywh_to_yminmax(box) for box in table_coords_xywh]
    image_coords_yminmax = [xywh_to_yminmax(box) for box in image_coords_xywh]

    # Save final results
    final_coords = {
        row_id: {
            "original_filename": filename,
            "tables": [[int(v) for v in box] for box in table_coords_yminmax],
            "images": [[int(v) for v in box] for box in image_coords_yminmax]
        }
    }

    with open('coords.json', 'w') as f:
        json.dump(final_coords, f, indent=4)

    # Save final annotated image (on original resolution)
    final_annotated_img = create_annotated_image(original_img, table_coords_xywh, image_coords_xywh)
    bounded_path = os.path.join('bounded_images', f"{row_id}.jpg")
    cv2.imwrite(bounded_path, final_annotated_img)

    print("\n" + "=" * 60)
    print(f"💾 Saved final coordinates for '{row_id}' to: coords.json")
    print(f"✅ Saved final annotated image to: {bounded_path}")
    print("✅ STAGE 1 COMPLETE")
    print("=" * 60)

def stage_2(
    row_id: str,
    box_type: Optional[str] = None,
    box_index: Optional[int] = None,
    custom_coords: Optional[List[int]] = None
):
    """
    Tests and visualizes a specific bounding box region from an original image.

    This function can be used in two ways:
    1. **By Index:** Provide `row_id`, `box_type` ('tables' or 'images'), and `box_index`.
    2. **By Custom Coordinates:** Provide `row_id` and `custom_coords` as [ymin, xmin, ymax, xmax].
    """
    print("=" * 60)
    print("STAGE 2: COORDINATE TESTING")
    print("=" * 60)

    # --- 1. Input Validation ---
    if custom_coords is None and not (box_type and box_index is not None):
        print("❌ Error: You must provide either `custom_coords` or both `box_type` and `box_index`.")
        return

    if box_type and box_type not in ['tables', 'images']:
        print(f"❌ Error: `box_type` must be either 'tables' or 'images', not '{box_type}'.")
        return

    # --- 2. Load Data and Image ---
    coords_path = 'coords.json'
    uploads_dir = 'uploads'

    if not os.path.exists(coords_path):
        print(f"❌ Error: '{coords_path}' not found. Please run stage_1() first.")
        return

    with open(coords_path, 'r') as f:
        all_coords = json.load(f)

    if row_id not in all_coords:
        print(f"❌ Error: `row_id` '{row_id}' not found in '{coords_path}'.")
        return

    # Look up the original filename using the row_id
    original_filename = all_coords[row_id].get("original_filename")
    if not original_filename:
        print(f"❌ Error: 'original_filename' not found for '{row_id}' in coords.json.")
        return

    original_image_path = os.path.join(uploads_dir, original_filename)
    if not os.path.exists(original_image_path):
        print(f"❌ Error: Could not find original image at '{original_image_path}'.")
        return

    original_image = cv2.imread(original_image_path)
    if original_image is None:
        print(f"❌ Error: Failed to load image from '{original_image_path}'.")
        return

    # --- 3. Get Coordinates to Test ---
    coords_to_test = None
    if custom_coords:
        print(f"🧪 Testing custom coordinates for '{row_id}'...")
        if len(custom_coords) != 4:
            print("❌ Error: `custom_coords` must be a list of 4 integers: [ymin, xmin, ymax, xmax].")
            return
        coords_to_test = custom_coords
    else:
        print(f"🧪 Testing '{box_type}' at index {box_index} for '{row_id}'...")
        try:
            boxes_list = all_coords[row_id][box_type]
            coords_to_test = boxes_list[box_index]
        except IndexError:
            box_count = len(all_coords[row_id].get(box_type, []))
            print(f"❌ Error: `box_index` {box_index} is out of bounds. There are only {box_count} boxes for '{box_type}'.")
            return
        except KeyError:
            print(f"❌ Error: `box_type` '{box_type}' not found for '{row_id}'.")
            return

    # --- 4. Crop and Display ---
    if coords_to_test:
        ymin, xmin, ymax, xmax = map(int, coords_to_test)

        # Ensure coordinates are within image bounds
        h, w, _ = original_image.shape
        ymin, xmin = max(0, ymin), max(0, xmin)
        ymax, xmax = min(h, ymax), min(w, xmax)

        if ymin >= ymax or xmin >= xmax:
            print(f"❌ Error: The coordinates {coords_to_test} result in an empty image region.")
            return

        # Create the side-by-side view
        image_with_box = original_image.copy()
        cv2.rectangle(image_with_box, (xmin, ymin), (xmax, ymax), (255, 0, 255), 3)  # Bright magenta box

        print("\n📸 Side-by-Side Preview (Original vs. Tested Coordinate):")
        cv2_imshow(np.hstack((original_image, image_with_box)))

        # Also show the zoomed-in crop for detail
        cropped_region = original_image[ymin:ymax, xmin:xmax]
        print("\n🖼️ Zoomed-in View of Cropped Region:")
        cv2_imshow(cropped_region)
    print("\n✅ STAGE 2 COMPLETE")

def stage_3(
    api_key: Optional[str] = None,
    custom_system_prompt: Optional[str] = None,
    output_fields: Optional[List[str]] = None,
    exclude_fields: Optional[List[str]] = None
):
    """
    Processes annotated images through LLM with customizable JSON output.

    Args:
        api_key: Your LLM API key. If None, you will be prompted.
        custom_system_prompt: An optional custom prompt to override the default.
        output_fields: A list of strings specifying which keys to INCLUDE.
            If None, all fields are included by default.
        exclude_fields: A list of strings specifying which keys to EXCLUDE
            from the final output. This is applied after `output_fields`.
    """
    print("=" * 60)
    print("STAGE 3: LLM CONTENT EXTRACTION")
    print("=" * 60)

    # --- 1. Determine Final Output Fields ---
    ALL_POSSIBLE_FIELDS = ["Page header", "Page text", "Page footer", "table_bbox", "image_bbox"]

    # Start with the user-defined list or all fields
    if output_fields is not None:
        fields_to_include = [field for field in output_fields if field in ALL_POSSIBLE_FIELDS]
    else:
        fields_to_include = ALL_POSSIBLE_FIELDS.copy()

    # Apply exclusions if provided
    if exclude_fields is not None:
        fields_to_include = [field for field in fields_to_include if field not in exclude_fields]
        print(f"✅ Excluding fields: {exclude_fields}")

    print(f"ℹ️ Final JSON will include: {fields_to_include}")

    # --- 2. Configure Gemini API ---
    if not api_key:
        try:
            api_key = getpass("🔑 Please enter your Model's API Key: ")
        except Exception as e:
            print(f"Could not read API key: {e}")
            return

    try:
        genai.configure(api_key=api_key)
    except Exception as e:
        print(f"❌ Error configuring API: {e}")
        return

    # --- 3. Define System Prompt ---
    if custom_system_prompt:
        system_prompt = custom_system_prompt
    else:
        system_prompt = r"""
You are a specialist in Spatial Document Intelligence. Your task is to perform Layout-Aware Content Extraction.
For each document page, you will analyze its structure, extract all content in the correct reading order, and format the output as a single, clean JSON object.

**CRITICAL INSTRUCTIONS:**

1. **Layout Detection & Reading Order:**
   * Accurately identify the layout: `single_column`, `two_column`, `three_column`, or `four_column`.
   * Blue vertical lines on the image are visual guides for column boundaries.
   * For multi-column layouts, extract the ENTIRE first column (leftmost) from top to bottom, THEN the ENTIRE second column, and so on. DO NOT interleave lines between columns.

2. **Header and Footer Extraction:**
   * **Decision Rule:** Headers and footers contain metadata ABOUT the document, not THE content OF the document.
   * **HEADER (Top ~15%):** Page numbers, document titles/IDs (e.g., "NACA RM 56807"), dates, author names, journal titles. Not figure titles or table titles.
   * **FOOTER (Bottom ~15%):** Page numbers, footnotes, copyright notices, references.
   * **EXCLUDE from Header/Footer:** Section titles (e.g., "RESULTS AND DISCUSSION"), the first paragraph of the main text, table headers, and figure captions belong in "Page text".

3. **Image Placeholder Insertion:**
   * Green boxes indicate pre-detected image regions. Your task is to place an `[image]` placeholder in the text where that image logically belongs.
   * Place the `[image]` placeholder at the nearest paragraph break corresponding to its vertical position in the reading order.
   * The image's caption text (e.g., "FIGURE 12. Displacement of pipeline...") must be included in the "Page text" immediately after the `[image]` placeholder, as it appears in the document.
   * The number of `[image]` placeholders MUST match the number of green boxes.

4. **Mathematical Content (LaTeX Formatting):**
   * **MANDATORY:** All mathematical expressions MUST be in LaTeX format.
   * Use `\[ ... \]` for display equations (equations on their own line).
   * Use `\( ... \)` for inline equations (equations within a line of text).
   * **CRITICAL FOR JSON VALIDITY:** Every backslash `\` in LaTeX commands MUST be escaped with a second backslash. This is required for the output to be valid JSON.
   * **Correct:** `"\\(x = \\frac{-b \\pm \\sqrt{b^2-4ac}}{2a}\\)"`
   * **Incorrect:** `"\(x = \frac{-b \pm \sqrt{b^2-4ac}}{2a}\)"`

5. **Table Extraction:**
   * Red boxes indicate pre-detected table regions. Your task is to extract the table content.
   * Extract all tables into clean, standard HTML `<table>` format.
   * Use `<thead>`, `<tbody>`, `<tr>`, `<th>`, and `<td>`.
   * If a header spans multiple rows or columns, explicitly use rowspan or colspan (instead of leaving empty <th> tags).
   * Ensure the number of columns in the header matches the number of data columns.
   * Place the entire `<table>...</table>` string in the "Page text" where it appears in the reading order.

NOTE:
**Visual Cues:**
* **Red Boxes:** These indicate tables. Your task is to extract the table content.
* **Green Boxes:** These indicate images. Place an `[image]` placeholder in the text where the image logically belongs. The image's caption must be included in the "Page text" right after the placeholder.

**OUTPUT FORMAT (Strictly JSON):**
Return ONLY a valid JSON object. Do not include any introductory text, explanations, or markdown code fences like ```json.

{
  "layout_type": "single_column | two_column | three_column | four_column",
  "Page header": "Text of the page header.",
  "Page text": "All body content, including [image] placeholders, LaTeX math, and HTML tables, in correct reading order.",
  "Page footer": "Text of the page footer."
}
"""

    # --- 4. Initialize Model and Load Data ---
    model = genai.GenerativeModel(
        model_name='gemini-1.5-flash',
        system_instruction=system_prompt
    )

    coords_path = 'coords.json'
    bounded_images_dir = 'bounded_images'
    final_outputs_dir = 'final_outputs'
    os.makedirs(final_outputs_dir, exist_ok=True)

    try:
        with open(coords_path, 'r') as f:
            all_coords = json.load(f)
    except FileNotFoundError:
        print(f"❌ Error: '{coords_path}' not found. Please run stage_1() first.")
        return

    bounded_images = sorted([f for f in os.listdir(bounded_images_dir) if f.endswith('.jpg')])
    if not bounded_images:
        print(f"❌ Error: No images found in '{bounded_images_dir}/'. Please run stage_1() first.")
        return

    # --- 5. Main Processing Loop ---
    print(f"\n📊 Found {len(bounded_images)} annotated image(s) to process.")
    not_approved_finals = []

    for img_file in bounded_images:
        row_id = os.path.splitext(img_file)[0]
        print("\n" + "=" * 50 + f"\nProcessing: {img_file}\n" + "=" * 50)

        if row_id not in all_coords:
            print(f"⚠️ Warning: No coordinates found for '{row_id}'. Skipping.")
            continue

        try:
            img_path = os.path.join(bounded_images_dir, img_file)
            image_part = {"mime_type": "image/jpeg", "data": open(img_path, 'rb').read()}

            print("✨ Extracting content…")
            response = model.generate_content([image_part])

            gem_json_str = response.text.strip()
            if gem_json_str.startswith("```json"):
                gem_json_str = gem_json_str[7:-3].strip()

            gem_json = json.loads(gem_json_str)
            print("✅ Extraction results ready.")

            # Build the final JSON dynamically based on the final list of fields
            final_json = {}
            for field in fields_to_include:
                if field == "Page header":
                    final_json["Page header"] = gem_json.get("Page header", "")
                elif field == "Page text":
                    final_json["Page text"] = gem_json.get("Page text", "").replace("[image]", "📷")
                elif field == "Page footer":
                    final_json["Page footer"] = gem_json.get("Page footer", "")
                elif field == "table_bbox":
                    final_json["table_bbox"] = all_coords[row_id].get("tables", [])
                elif field == "image_bbox":
                    final_json["image_bbox"] = all_coords[row_id].get("images", [])

            print("\n📋 Final JSON for Approval:")
            print("-" * 40)
            print(json.dumps(final_json, indent=2))
            print("-" * 40)

            approval = input("❓ Approve this output? (Enter=Yes, n=No): ").strip().lower()
            if approval == 'n':
                not_approved_finals.append(img_file)
                print("❌ Marked as not approved. Continuing...")
            else:
                output_path = os.path.join(final_outputs_dir, f"{row_id}.json")
                with open(output_path, 'w', encoding='utf-8') as f:
                    json.dump(final_json, f, indent=4, ensure_ascii=False)
                print(f"✅ Approved and saved to: {output_path}")

        except Exception as e:
            print(f"❌ An error occurred while processing {img_file}: {e}")
            not_approved_finals.append(img_file)
            continue

    # --- 6. Final Summary ---
    print("\n" + "=" * 60 + "\n✅ STAGE 3 COMPLETE")
    print(f"Total images processed: {len(bounded_images)}")
    approved_count = len(bounded_images) - len(not_approved_finals)
    print(f" - Approved and saved: {approved_count}")
    print(f" - Not approved/Failed: {len(not_approved_finals)}")

readgrid-0.1.0.dist-info/METADATA
ADDED
@@ -0,0 +1,113 @@
Metadata-Version: 2.4
Name: readgrid
Version: 0.1.0
Summary: A document layout pipeline for detecting tables, images, and structured extraction.
Home-page: https://github.com/davidkjeremiah/readgrid
Author: David Jeremiah
Author-email: flasconnect@gmail.com
License: MIT
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: opencv-python
Requires-Dist: numpy
Requires-Dist: Pillow
Requires-Dist: google-generativeai
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: license
Dynamic: license-file
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary

# readgrid

**readgrid** is a Python package for **document layout analysis and content extraction**.
It lets you upload scanned documents, automatically detect tables and images, manually adjust bounding boxes, and extract clean structured output using LLMs.

---

## ✨ Features
- **Stage 1 – Upload & Detect**
  - Upload document images.
  - Automatically detect **tables** (red boxes) and **images** (green boxes).
  - Manually edit bounding boxes with an interactive editor.

- **Stage 2 – Coordinate Testing**
  - Verify detected regions with side-by-side previews.
  - Test either existing detections or custom coordinates.

- **Stage 3 – Content Extraction**
  - Extract structured JSON output (header, text, footer).
  - Replace detected tables with clean HTML `<table>` tags.
  - Insert `[image]` placeholders with captions in reading order.
  - Supports LaTeX formatting for equations.

- **Utility Functions** (usage sketch below)
  - `pretty_print_page_with_image()` – inspect extracted results with annotated images.
  - `show_comparison_view()` – compare annotated vs. reconstructed content.
  - `cleanup_pipeline()` – reset all artifacts.
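
A minimal sketch of the utility helpers after a full run; the `final_outputs/ID_1.json` path is illustrative and assumes `stage_3()` has already saved an approved result:

```python
from readgrid import (
    pretty_print_page_with_image,
    show_comparison_view,
    cleanup_pipeline,
)

# Inspect one extracted page alongside its annotated image
pretty_print_page_with_image("final_outputs/ID_1.json")

# Side-by-side HTML view: annotated page vs. reconstructed content
show_comparison_view("final_outputs/ID_1.json")

# Remove uploads/, bounded_images/, final_outputs/, and coords.json
cleanup_pipeline()
```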

---

## 🚀 Installation
```bash
pip install readgrid
```
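
The pipeline imports `google.colab` helpers (`files.upload`, `cv2_imshow`, `output.eval_js`), so the interactive stages are intended to run inside a Google Colab notebook.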

---

## 🛠️ Usage

### Stage 1: Upload, Detect, and Edit

```python
from readgrid import stage_1

stage_1()
```
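
On approval, `stage_1()` writes the detected coordinates to `coords.json` (as `[ymin, xmin, ymax, xmax]`) and saves the annotated page to `bounded_images/<row_id>.jpg`; the later stages read both artifacts.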

### Stage 2: Test Coordinates

```python
from readgrid import stage_2

stage_2(
    row_id="ID_1",
    box_type="tables",
    box_index=0
)
```
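
You can also probe an arbitrary region instead of a stored detection via `custom_coords`; the coordinates below are made-up and are clamped to the image bounds:

```python
from readgrid import stage_2

stage_2(
    row_id="ID_1",
    custom_coords=[100, 50, 400, 600]  # [ymin, xmin, ymax, xmax]
)
```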

### Stage 3: Extract with Gemini

```python
from readgrid import stage_3

stage_3(api_key="YOUR_API_KEY")
```
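
The saved JSON can be trimmed with `output_fields` / `exclude_fields`. A sketch that keeps the text fields and drops the bounding-box lists:

```python
from readgrid import stage_3

# Keep only the text fields; bbox lists are dropped from the saved JSON
stage_3(
    api_key="YOUR_API_KEY",
    exclude_fields=["table_bbox", "image_bbox"]
)
```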

---

## 📦 Requirements

* Python 3.8+
* [OpenCV](https://pypi.org/project/opencv-python/)
* [NumPy](https://pypi.org/project/numpy/)
* [Pillow](https://pypi.org/project/Pillow/)
* google-generativeai

---

## 📄 License

MIT License.
See [LICENSE](LICENSE) for details.

readgrid-0.1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,7 @@
readgrid/__init__.py,sha256=yuzTcrV4bHJmbbDsUC4oXvcIXD0ymZTEOylM463t6aw,309
readgrid/pipeline.py,sha256=7tn3oSy8lt-XwCQphwHt7CkXLlohWeLMJaq_gUkqd6c,38788
readgrid-0.1.0.dist-info/licenses/LICENSE,sha256=c2sTHX7_m7IixtkJva1S3FK0wRbXTDoHIuI9_jDy2ek,1070
readgrid-0.1.0.dist-info/METADATA,sha256=cd4ZAfOLM9hUKljCzf-o74GJNy7w93GHt18fGU4Y2tg,2817
readgrid-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
readgrid-0.1.0.dist-info/top_level.txt,sha256=UzFgU214mQ3qBzpO96a9UKkFyyVj2lPbvMCwfHddUpo,9
readgrid-0.1.0.dist-info/RECORD,,

readgrid-0.1.0.dist-info/licenses/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

© 2025 David

readgrid-0.1.0.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
readgrid