natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,472 +0,0 @@
|
|
1
|
-
(function () {
|
2
|
-
console.log("DEBUG: app.js IIFE started");

// React hooks are read directly from the global `React` object.
const { useState, useCallback, useMemo, useRef, useEffect, createContext, useContext } = React;

// Helpers read from globals on `window`.
const JSZip = window.JSZip;
const saveAs = window.saveAs;

// Context carrying the spell-check dictionary (a Set of words).
// The default value is an empty Set so that a consumer rendered outside of a
// provider reads `.size === 0` instead of dereferencing `undefined`.
const DictionaryContext = createContext(new Set());
|
12
|
-
|
13
|
-
// Provider component to load and supply dictionary
|
14
|
-
const DictionaryProvider = ({ children }) => {
|
15
|
-
const [dictionary, setDictionary] = useState(new Set());
|
16
|
-
|
17
|
-
useEffect(() => {
|
18
|
-
async function loadWords() {
|
19
|
-
const file = await fetch('words.txt').then(res => res.blob());
|
20
|
-
const wordsSet = new Set((await file.text()).split(/\s+/));
|
21
|
-
setDictionary(wordsSet);
|
22
|
-
}
|
23
|
-
|
24
|
-
loadWords();
|
25
|
-
}, []);
|
26
|
-
|
27
|
-
return (
|
28
|
-
<DictionaryContext.Provider value={dictionary}>
|
29
|
-
{children}
|
30
|
-
</DictionaryContext.Provider>
|
31
|
-
);
|
32
|
-
};
|
33
|
-
|
34
|
-
/**
 * Hook returning the current value of DictionaryContext, for components
 * that need to look words up in the dictionary.
 */
const useDictionary = () => useContext(DictionaryContext);
|
38
|
-
|
39
|
-
function FlaggedText({ word }) {
|
40
|
-
const dictionary = useDictionary();
|
41
|
-
if(!dictionary.size) return word;
|
42
|
-
|
43
|
-
function levenshteinDistance(a, b) {
|
44
|
-
if (a.length === 0) return b.length;
|
45
|
-
if (b.length === 0) return a.length;
|
46
|
-
|
47
|
-
const matrix = [];
|
48
|
-
|
49
|
-
for (let i = 0; i <= b.length; i++) {
|
50
|
-
matrix[i] = [i];
|
51
|
-
}
|
52
|
-
|
53
|
-
for (let j = 1; j <= a.length; j++) {
|
54
|
-
matrix[0][j] = j;
|
55
|
-
}
|
56
|
-
|
57
|
-
for (let i = 1; i <= b.length; i++) {
|
58
|
-
for (let j = 1; j <= a.length; j++) {
|
59
|
-
const cost = (b[i - 1] === a[j - 1]) ? 0 : 1;
|
60
|
-
|
61
|
-
matrix[i][j] = Math.min(
|
62
|
-
matrix[i - 1][j - 1] + cost, // substitution
|
63
|
-
matrix[i][j - 1] + 1, // insertion
|
64
|
-
matrix[i - 1][j] + 1 // deletion
|
65
|
-
);
|
66
|
-
}
|
67
|
-
}
|
68
|
-
|
69
|
-
return matrix[b.length][a.length];
|
70
|
-
}
|
71
|
-
|
72
|
-
function suggestClosestMatch(word) {
|
73
|
-
let bestMatch = null;
|
74
|
-
let minDistance = Infinity;
|
75
|
-
|
76
|
-
for (const candidate of dictionary) {
|
77
|
-
if(Math.abs(word.length - candidate.length) > 2)
|
78
|
-
continue;
|
79
|
-
const distance = levenshteinDistance(word, candidate);
|
80
|
-
|
81
|
-
if (distance < minDistance) {
|
82
|
-
bestMatch = candidate;
|
83
|
-
minDistance = distance;
|
84
|
-
}
|
85
|
-
}
|
86
|
-
|
87
|
-
return bestMatch || null; // Return the closest match or null if none found
|
88
|
-
}
|
89
|
-
|
90
|
-
|
91
|
-
const closestWord = suggestClosestMatch(word);
|
92
|
-
|
93
|
-
return <span>
|
94
|
-
<span style={{ backgroundColor: 'yellow' }}>{word} </span>
|
95
|
-
<span style={{ backgroundColor: 'lightgreen' }}>{closestWord}</span>
|
96
|
-
</span>;
|
97
|
-
}
|
98
|
-
|
99
|
-
function CheckedText({ inputText }) {
|
100
|
-
const dictionary = useDictionary();
|
101
|
-
|
102
|
-
if (!dictionary.size) return <p>Loading dictionary...</p>;
|
103
|
-
|
104
|
-
const processText = () => {
|
105
|
-
const wordsAndPunctuations = inputText.split(/(\b|\W)/g);
|
106
|
-
|
107
|
-
return wordsAndPunctuations.map((word, index) => {
|
108
|
-
if (!word.trim()) return word;
|
109
|
-
|
110
|
-
if (
|
111
|
-
/[^\d]/.test(word)
|
112
|
-
&& word.length > 2
|
113
|
-
&& /^\w+$/.test(word)
|
114
|
-
&& !dictionary.has(word)
|
115
|
-
&& !dictionary.has(word.toLowerCase())
|
116
|
-
&& !(word.endsWith('s') && dictionary.has(word.slice(0, -1).toLowerCase()))
|
117
|
-
) {
|
118
|
-
return word
|
119
|
-
// return <FlaggedText word={word} />;
|
120
|
-
}
|
121
|
-
|
122
|
-
return word;
|
123
|
-
});
|
124
|
-
};
|
125
|
-
|
126
|
-
return (
|
127
|
-
<div>
|
128
|
-
{processText()}
|
129
|
-
</div>
|
130
|
-
);
|
131
|
-
}
|
132
|
-
|
133
|
-
// --- Region Row Component ---
|
134
|
-
function RegionRow({ region, imageUrl, pageData, pageIndex, regionIndex, onTextChange, onEnterPress }) {
|
135
|
-
const textRef = useRef(null);
|
136
|
-
const canvasRef = useRef(null);
|
137
|
-
|
138
|
-
function handleContentEditableChange(event) {
|
139
|
-
const newText = event.target.innerText;
|
140
|
-
onTextChange(pageIndex, regionIndex, newText);
|
141
|
-
}
|
142
|
-
|
143
|
-
// Ensure the defensive check is present
|
144
|
-
if (!region || !region.bbox || region.bbox.length !== 4) {
|
145
|
-
console.warn("RegionRow received invalid region prop - skipping render.", { region });
|
146
|
-
return null; // Don't render anything if region is invalid
|
147
|
-
}
|
148
|
-
|
149
|
-
// --- Calculate dimensions ---
|
150
|
-
const imgScale = 1.0; // Set to 1.0 assuming bbox coords match image pixels
|
151
|
-
const [x0, y0, x1, y1] = region.bbox;
|
152
|
-
// Source coordinates and dimensions on the original image (now directly from bbox)
|
153
|
-
const sourceX = x0 * imgScale; // Now just x0
|
154
|
-
const sourceY = y0 * imgScale; // Now just y0
|
155
|
-
const sourceWidth = (x1 - x0) * imgScale; // Now just width from bbox
|
156
|
-
const sourceHeight = (y1 - y0) * imgScale; // Now just height from bbox
|
157
|
-
|
158
|
-
// --- useEffect for drawing on canvas ---
|
159
|
-
useEffect(() => {
|
160
|
-
if (!imageUrl || !canvasRef.current) {
|
161
|
-
return; // Don't draw if no image URL or canvas isn't ready
|
162
|
-
}
|
163
|
-
|
164
|
-
const canvas = canvasRef.current;
|
165
|
-
const ctx = canvas.getContext('2d');
|
166
|
-
const img = new Image();
|
167
|
-
|
168
|
-
img.onload = () => {
|
169
|
-
// Set canvas intrinsic size to match the source snippet dimensions
|
170
|
-
canvas.width = sourceWidth;
|
171
|
-
canvas.height = sourceHeight;
|
172
|
-
|
173
|
-
// Draw the specific region from the loaded image onto the canvas
|
174
|
-
ctx.drawImage(
|
175
|
-
img, // Source image
|
176
|
-
sourceX, // Source X
|
177
|
-
sourceY, // Source Y
|
178
|
-
sourceWidth,// Source Width
|
179
|
-
sourceHeight,// Source Height
|
180
|
-
0, // Destination X (on canvas)
|
181
|
-
0, // Destination Y (on canvas)
|
182
|
-
sourceWidth,// Destination Width (on canvas)
|
183
|
-
sourceHeight // Destination Height (on canvas)
|
184
|
-
);
|
185
|
-
};
|
186
|
-
|
187
|
-
img.onerror = (err) => {
|
188
|
-
console.error("Failed to load image for canvas:", imageUrl, err);
|
189
|
-
// Optionally draw an error message or clear the canvas
|
190
|
-
ctx.clearRect(0, 0, canvas.width, canvas.height);
|
191
|
-
ctx.fillStyle = "red";
|
192
|
-
ctx.fillText("Error loading image", 10, 20);
|
193
|
-
};
|
194
|
-
|
195
|
-
img.src = imageUrl; // Start loading the image
|
196
|
-
|
197
|
-
// No cleanup needed for image loading, but good practice for other effects
|
198
|
-
// return () => { /* cleanup code */ };
|
199
|
-
|
200
|
-
}, [imageUrl, region, sourceX, sourceY, sourceWidth, sourceHeight]); // Re-run effect if image or region changes
|
201
|
-
|
202
|
-
|
203
|
-
// Style for the canvas container/scaling
|
204
|
-
const canvasStyle = {
|
205
|
-
// Display at 2x size (defined by sourceWidth)
|
206
|
-
width: `${sourceWidth}px`,
|
207
|
-
// Let height adjust automatically based on width and aspect ratio
|
208
|
-
height: `auto`,
|
209
|
-
// But constrain width to container
|
210
|
-
maxWidth: '100%',
|
211
|
-
// Remove transform scaling
|
212
|
-
// transform: displayScale < 1 ? `scale(${displayScale})` : 'none',
|
213
|
-
// transformOrigin: 'top left',
|
214
|
-
// Keep other relevant styles
|
215
|
-
borderRadius: '3px',
|
216
|
-
boxShadow: '0 1px 3px rgba(0,0,0,0.2)',
|
217
|
-
display: 'inline-block', // Or 'block' depending on desired layout flow
|
218
|
-
margin: '0 auto',
|
219
|
-
};
|
220
|
-
// --- END Canvas Logic ---
|
221
|
-
|
222
|
-
const confidenceLevel = region.confidence >= 0.8 ? 'high' : (region.confidence >= 0.5 ? 'medium' : 'low');
|
223
|
-
const confidenceText = region.confidence !== null && region.confidence !== undefined ? region.confidence.toFixed(2) : 'N/A';
|
224
|
-
const handleInputChange = (e) => {
|
225
|
-
onTextChange(pageIndex, regionIndex, e.target.value);
|
226
|
-
};
|
227
|
-
// const handleFocus = (e) => {
|
228
|
-
// e.target.select();
|
229
|
-
// };
|
230
|
-
const handleKeyDown = (e) => {
|
231
|
-
if (e.key === 'Enter') {
|
232
|
-
e.preventDefault();
|
233
|
-
onEnterPress(e.target);
|
234
|
-
}
|
235
|
-
};
|
236
|
-
|
237
|
-
// Convert to JSX
|
238
|
-
return (
|
239
|
-
<div className="region-item" data-region-id={region.id} data-confidence={region.confidence} data-modified={region.modified}>
|
240
|
-
<div className="confidence-cell" data-level={confidenceLevel}>
|
241
|
-
{confidenceText}
|
242
|
-
</div>
|
243
|
-
|
244
|
-
<div className="region-content-cell">
|
245
|
-
{/* Replace div with canvas */}
|
246
|
-
<canvas
|
247
|
-
ref={canvasRef}
|
248
|
-
style={canvasStyle}
|
249
|
-
// Set initial width/height perhaps? Or rely purely on useEffect
|
250
|
-
// width={sourceWidth} // Setting via useEffect is generally better
|
251
|
-
// height={sourceHeight}
|
252
|
-
className="image-clip-canvas" // Add a class for potential styling
|
253
|
-
></canvas>
|
254
|
-
<div
|
255
|
-
ref={textRef}
|
256
|
-
contentEditable
|
257
|
-
// rows={Math.max(1, Math.ceil((region.corrected_text || '').length / 50))}
|
258
|
-
data-page-index={pageIndex}
|
259
|
-
data-region-index={regionIndex}
|
260
|
-
data-original-text={region.ocr_text}
|
261
|
-
// onInput={handleInputChange}
|
262
|
-
// onFocus={handleFocus}
|
263
|
-
// onKeyDown={handleKeyDown}
|
264
|
-
className={`editing-content ${region.modified ? 'modified' : ''}`}
|
265
|
-
// onInput={handleContentEditableChange} // Handle text changes
|
266
|
-
>
|
267
|
-
<CheckedText inputText={region.corrected_text} />
|
268
|
-
{/* <CheckedText inputText={region.ocr_text} /> */}
|
269
|
-
</div>
|
270
|
-
</div>
|
271
|
-
</div>
|
272
|
-
);
|
273
|
-
}
|
274
|
-
|
275
|
-
// --- Region Table Component ---
|
276
|
-
function RegionTable({ regions, imageUrl, pageData, pageIndex, onTextChange, onEnterPress }) {
|
277
|
-
// Ensure filtering is done using useMemo for efficiency
|
278
|
-
const validRegions = useMemo(() =>
|
279
|
-
(regions || []).filter(r => r && r.id && r.bbox && typeof r.ocr_text === 'string'),
|
280
|
-
[regions] // Recalculate only when regions array changes
|
281
|
-
);
|
282
|
-
|
283
|
-
if (!validRegions || validRegions.length === 0) {
|
284
|
-
// Convert to JSX
|
285
|
-
return <p>No valid OCR regions to display for this page section.</p>;
|
286
|
-
}
|
287
|
-
|
288
|
-
// Convert to JSX
|
289
|
-
return (
|
290
|
-
<div className="region-list">
|
291
|
-
{validRegions.map((region, rIndex) => (
|
292
|
-
<RegionRow
|
293
|
-
key={region.id}
|
294
|
-
region={region}
|
295
|
-
imageUrl={imageUrl}
|
296
|
-
pageData={pageData}
|
297
|
-
pageIndex={pageIndex}
|
298
|
-
regionIndex={rIndex}
|
299
|
-
onTextChange={onTextChange}
|
300
|
-
onEnterPress={onEnterPress}
|
301
|
-
/>
|
302
|
-
))}
|
303
|
-
</div>
|
304
|
-
);
|
305
|
-
}
|
306
|
-
|
307
|
-
// --- Main Application Component ---
|
308
|
-
function App() {
|
309
|
-
// State for the application
|
310
|
-
const [taskData, setTaskData] = useState(null); // Holds the parsed manifest.json
|
311
|
-
const [imageData, setImageData] = useState({}); // Holds { relativePath: objectURL }
|
312
|
-
const [isLoading, setIsLoading] = useState(false);
|
313
|
-
const [error, setError] = useState(null);
|
314
|
-
const [currentFilename, setCurrentFilename] = useState(''); // Name of the loaded zip
|
315
|
-
|
316
|
-
// --- File Handling ---
|
317
|
-
const handleFileChange = useCallback(async (event) => {
|
318
|
-
const file = event.target.files[0];
|
319
|
-
if (!file) return;
|
320
|
-
|
321
|
-
setIsLoading(true);
|
322
|
-
setError(null);
|
323
|
-
setTaskData(null);
|
324
|
-
setCurrentFilename(file.name);
|
325
|
-
// Revoke previous object URLs
|
326
|
-
Object.values(imageData).forEach(URL.revokeObjectURL);
|
327
|
-
setImageData({});
|
328
|
-
|
329
|
-
try {
|
330
|
-
const zip = await JSZip.loadAsync(file); // Load zip file
|
331
|
-
|
332
|
-
// 1. Load Manifest
|
333
|
-
const manifestFile = zip.file("manifest.json");
|
334
|
-
if (!manifestFile) {
|
335
|
-
throw new Error("manifest.json not found in the zip file.");
|
336
|
-
}
|
337
|
-
const manifestContent = await manifestFile.async("string");
|
338
|
-
const parsedManifest = JSON.parse(manifestContent);
|
339
|
-
// TODO: Add validation for manifest structure?
|
340
|
-
setTaskData(parsedManifest);
|
341
|
-
|
342
|
-
// 2. Load Images and create Object URLs
|
343
|
-
const imagePromises = [];
|
344
|
-
const newImageData = {};
|
345
|
-
zip.folder("images").forEach((relativePath, fileEntry) => {
|
346
|
-
if (!fileEntry.dir) {
|
347
|
-
imagePromises.push(
|
348
|
-
fileEntry.async("blob").then(blob => {
|
349
|
-
const objectURL = URL.createObjectURL(blob);
|
350
|
-
// Store URL mapped to the relative path used in manifest
|
351
|
-
newImageData[`images/${relativePath}`] = objectURL;
|
352
|
-
})
|
353
|
-
);
|
354
|
-
}
|
355
|
-
});
|
356
|
-
|
357
|
-
await Promise.all(imagePromises);
|
358
|
-
setImageData(newImageData);
|
359
|
-
console.log("Loaded images:", Object.keys(newImageData));
|
360
|
-
|
361
|
-
|
362
|
-
} catch (err) {
|
363
|
-
console.error("Error loading task package:", err);
|
364
|
-
setError(`Error loading task package: ${err.message}`);
|
365
|
-
setTaskData(null); // Clear data on error
|
366
|
-
setImageData({});
|
367
|
-
setCurrentFilename('');
|
368
|
-
} finally {
|
369
|
-
setIsLoading(false);
|
370
|
-
// Reset file input value so the same file can be loaded again
|
371
|
-
event.target.value = null;
|
372
|
-
}
|
373
|
-
}, [imageData]); // Depend on imageData to revoke old URLs
|
374
|
-
|
375
|
-
// --- Text Area Change Handler ---
|
376
|
-
const handleTextChange = (pageIndex, regionIndex, newText) => {
|
377
|
-
setTaskData(prevData => {
|
378
|
-
if (!prevData) return null; // Handle null state
|
379
|
-
|
380
|
-
const newData = { ...prevData }; // Shallow copy top-level object
|
381
|
-
const newPages = [...newData.pages]; // Create a shallow copy of pages array
|
382
|
-
|
383
|
-
// Ensure pageIndex and regionIndex are within bounds to prevent errors
|
384
|
-
if (newPages[pageIndex] && Array.isArray(newPages[pageIndex].regions)) {
|
385
|
-
const newRegions = [...newPages[pageIndex].regions]; // Shallow copy regions array
|
386
|
-
const region = newRegions[regionIndex];
|
387
|
-
|
388
|
-
// Update only the specific region's corrected_text and modified fields
|
389
|
-
if (region) {
|
390
|
-
region.corrected_text = newText;
|
391
|
-
region.modified = newText !== region.ocr_text;
|
392
|
-
newRegions[regionIndex] = region; // Reassign updated region to array
|
393
|
-
|
394
|
-
newPages[pageIndex].regions = newRegions; // Reassign updated regions array to page
|
395
|
-
}
|
396
|
-
}
|
397
|
-
|
398
|
-
newData.pages = newPages; // Reassign updated pages array to data object
|
399
|
-
return newData;
|
400
|
-
});
|
401
|
-
};
|
402
|
-
|
403
|
-
// --- Enter Key Navigation Handler ---
|
404
|
-
const handleEnterNavigation = (currentTextArea) => {
|
405
|
-
const allTextAreas = Array.from(document.querySelectorAll('.text-content-input'));
|
406
|
-
const currentIndex = allTextAreas.indexOf(currentTextArea);
|
407
|
-
|
408
|
-
if (currentIndex > -1 && currentIndex < allTextAreas.length - 1) {
|
409
|
-
const nextTextArea = allTextAreas[currentIndex + 1];
|
410
|
-
nextTextArea.focus();
|
411
|
-
}
|
412
|
-
};
|
413
|
-
|
414
|
-
// --- UI Rendering (Convert to JSX) ---
|
415
|
-
return (
|
416
|
-
<DictionaryProvider>
|
417
|
-
<div className="app-container">
|
418
|
-
<div className="task-loader">
|
419
|
-
<label htmlFor="zip-input">Load Correction Task Package (.zip): </label>
|
420
|
-
<input type="file" id="zip-input" accept=".zip" onChange={handleFileChange} disabled={isLoading} />
|
421
|
-
{isLoading && <span> Loading...</span>}
|
422
|
-
</div>
|
423
|
-
|
424
|
-
{error && <div className="error-message" style={{ color: 'red', margin: '10px', padding: '10px', border: '1px solid red' }}>{error}</div>}
|
425
|
-
|
426
|
-
{!isLoading && !taskData && !error && (
|
427
|
-
<div className="initial-message">
|
428
|
-
<p>Please load a .zip task package to begin.</p>
|
429
|
-
</div>
|
430
|
-
)}
|
431
|
-
|
432
|
-
{taskData && (
|
433
|
-
<div className="task-content">
|
434
|
-
<h2>Task: {currentFilename}</h2>
|
435
|
-
<p>PDF Source: {taskData.pdfs && taskData.pdfs.length > 0 ? taskData.pdfs[0].source : (taskData.pages[0]?.pdf_source || 'Unknown')} ({taskData.pages?.length || 0} pages)</p>
|
436
|
-
|
437
|
-
<div className="controls-container">
|
438
|
-
{/* TODO: Add Export Functionality */}
|
439
|
-
<button id="export-corrections" className="export-btn">Export Corrections JSON</button>
|
440
|
-
</div>
|
441
|
-
|
442
|
-
<div className="pages-container">
|
443
|
-
{taskData.pages.map((page, pIndex) => (
|
444
|
-
<div className="page-section" key={page.image_path}>
|
445
|
-
<div className="page-title">
|
446
|
-
<h3>Page {page.page_number} (Source: {page.pdf_short_id})</h3>
|
447
|
-
</div>
|
448
|
-
<RegionTable
|
449
|
-
regions={page.regions}
|
450
|
-
imageUrl={imageData[page.image_path]} /* Pass the object URL */
|
451
|
-
pageData={page} /* Pass page metadata (width, height) */
|
452
|
-
pageIndex={pIndex}
|
453
|
-
onTextChange={handleTextChange}
|
454
|
-
onEnterPress={handleEnterNavigation}
|
455
|
-
/>
|
456
|
-
</div>
|
457
|
-
))}
|
458
|
-
</div>
|
459
|
-
</div>
|
460
|
-
)}
|
461
|
-
</div>
|
462
|
-
</DictionaryProvider>
|
463
|
-
);
|
464
|
-
}
|
465
|
-
|
466
|
-
console.log("DEBUG: Mounting React app...");
|
467
|
-
// Mount the app to the DOM using ReactDOM.createRoot
|
468
|
-
const root = ReactDOM.createRoot(document.getElementById('app'));
|
469
|
-
root.render(<App />);
|
470
|
-
console.log("DEBUG: React app mount initiated.");
|
471
|
-
|
472
|
-
})(); // Immediately invoke the function
|