natural-pdf 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. docs/ocr/index.md +34 -47
  2. docs/tutorials/01-loading-and-extraction.ipynb +60 -46
  3. docs/tutorials/02-finding-elements.ipynb +42 -42
  4. docs/tutorials/03-extracting-blocks.ipynb +17 -17
  5. docs/tutorials/04-table-extraction.ipynb +12 -12
  6. docs/tutorials/05-excluding-content.ipynb +30 -30
  7. docs/tutorials/06-document-qa.ipynb +28 -28
  8. docs/tutorials/07-layout-analysis.ipynb +63 -35
  9. docs/tutorials/07-working-with-regions.ipynb +55 -51
  10. docs/tutorials/07-working-with-regions.md +2 -2
  11. docs/tutorials/08-spatial-navigation.ipynb +60 -60
  12. docs/tutorials/09-section-extraction.ipynb +113 -113
  13. docs/tutorials/10-form-field-extraction.ipynb +78 -50
  14. docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
  15. docs/tutorials/12-ocr-integration.ipynb +149 -131
  16. docs/tutorials/12-ocr-integration.md +0 -13
  17. docs/tutorials/13-semantic-search.ipynb +313 -873
  18. natural_pdf/__init__.py +21 -23
  19. natural_pdf/analyzers/layout/gemini.py +264 -0
  20. natural_pdf/analyzers/layout/layout_manager.py +28 -1
  21. natural_pdf/analyzers/layout/layout_options.py +11 -0
  22. natural_pdf/analyzers/layout/yolo.py +6 -2
  23. natural_pdf/collections/pdf_collection.py +21 -0
  24. natural_pdf/core/element_manager.py +16 -13
  25. natural_pdf/core/page.py +165 -36
  26. natural_pdf/core/pdf.py +146 -41
  27. natural_pdf/elements/base.py +11 -17
  28. natural_pdf/elements/collections.py +100 -38
  29. natural_pdf/elements/region.py +77 -38
  30. natural_pdf/elements/text.py +5 -0
  31. natural_pdf/ocr/__init__.py +49 -36
  32. natural_pdf/ocr/engine.py +146 -51
  33. natural_pdf/ocr/engine_easyocr.py +141 -161
  34. natural_pdf/ocr/engine_paddle.py +107 -193
  35. natural_pdf/ocr/engine_surya.py +75 -148
  36. natural_pdf/ocr/ocr_factory.py +114 -0
  37. natural_pdf/ocr/ocr_manager.py +65 -93
  38. natural_pdf/ocr/ocr_options.py +7 -17
  39. natural_pdf/ocr/utils.py +98 -0
  40. natural_pdf/templates/spa/css/style.css +334 -0
  41. natural_pdf/templates/spa/index.html +31 -0
  42. natural_pdf/templates/spa/js/app.js +472 -0
  43. natural_pdf/templates/spa/words.txt +235976 -0
  44. natural_pdf/utils/debug.py +32 -0
  45. natural_pdf/utils/identifiers.py +29 -0
  46. natural_pdf/utils/packaging.py +418 -0
  47. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +41 -19
  48. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/RECORD +51 -44
  49. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  50. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/top_level.txt +0 -1
  51. natural_pdf/templates/ocr_debug.html +0 -517
  52. tests/test_loading.py +0 -50
  53. tests/test_optional_deps.py +0 -298
  54. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,472 @@
1
+ (function () {
2
+ console.log("DEBUG: app.js IIFE started");
3
+ // Get hooks directly from the React global
4
+ const { useState, useCallback, useMemo, useRef, useEffect, createContext, useContext } = React;
5
+ // No need for htmPreact or manual binding with JSX
6
+
7
+ const JSZip = window.JSZip;
8
+ const saveAs = window.saveAs;
9
+
10
+ // Create context for the dictionary
11
+ const DictionaryContext = createContext();
12
+
13
+ // Provider component to load and supply dictionary
14
+ const DictionaryProvider = ({ children }) => {
15
+ const [dictionary, setDictionary] = useState(new Set());
16
+
17
+ useEffect(() => {
18
+ async function loadWords() {
19
+ const file = await fetch('words.txt').then(res => res.blob());
20
+ const wordsSet = new Set((await file.text()).split(/\s+/));
21
+ setDictionary(wordsSet);
22
+ }
23
+
24
+ loadWords();
25
+ }, []);
26
+
27
+ return (
28
+ <DictionaryContext.Provider value={dictionary}>
29
+ {children}
30
+ </DictionaryContext.Provider>
31
+ );
32
+ };
33
+
34
/**
 * Hook returning the value currently published on DictionaryContext
 * (the word Set supplied by DictionaryProvider).
 */
const useDictionary = () => useContext(DictionaryContext);
38
+
39
+ function FlaggedText({ word }) {
40
+ const dictionary = useDictionary();
41
+ if(!dictionary.size) return word;
42
+
43
+ function levenshteinDistance(a, b) {
44
+ if (a.length === 0) return b.length;
45
+ if (b.length === 0) return a.length;
46
+
47
+ const matrix = [];
48
+
49
+ for (let i = 0; i <= b.length; i++) {
50
+ matrix[i] = [i];
51
+ }
52
+
53
+ for (let j = 1; j <= a.length; j++) {
54
+ matrix[0][j] = j;
55
+ }
56
+
57
+ for (let i = 1; i <= b.length; i++) {
58
+ for (let j = 1; j <= a.length; j++) {
59
+ const cost = (b[i - 1] === a[j - 1]) ? 0 : 1;
60
+
61
+ matrix[i][j] = Math.min(
62
+ matrix[i - 1][j - 1] + cost, // substitution
63
+ matrix[i][j - 1] + 1, // insertion
64
+ matrix[i - 1][j] + 1 // deletion
65
+ );
66
+ }
67
+ }
68
+
69
+ return matrix[b.length][a.length];
70
+ }
71
+
72
+ function suggestClosestMatch(word) {
73
+ let bestMatch = null;
74
+ let minDistance = Infinity;
75
+
76
+ for (const candidate of dictionary) {
77
+ if(Math.abs(word.length - candidate.length) > 2)
78
+ continue;
79
+ const distance = levenshteinDistance(word, candidate);
80
+
81
+ if (distance < minDistance) {
82
+ bestMatch = candidate;
83
+ minDistance = distance;
84
+ }
85
+ }
86
+
87
+ return bestMatch || null; // Return the closest match or null if none found
88
+ }
89
+
90
+
91
+ const closestWord = suggestClosestMatch(word);
92
+
93
+ return <span>
94
+ <span style={{ backgroundColor: 'yellow' }}>{word} </span>
95
+ <span style={{ backgroundColor: 'lightgreen' }}>{closestWord}</span>
96
+ </span>;
97
+ }
98
+
99
+ function CheckedText({ inputText }) {
100
+ const dictionary = useDictionary();
101
+
102
+ if (!dictionary.size) return <p>Loading dictionary...</p>;
103
+
104
+ const processText = () => {
105
+ const wordsAndPunctuations = inputText.split(/(\b|\W)/g);
106
+
107
+ return wordsAndPunctuations.map((word, index) => {
108
+ if (!word.trim()) return word;
109
+
110
+ if (
111
+ /[^\d]/.test(word)
112
+ && word.length > 2
113
+ && /^\w+$/.test(word)
114
+ && !dictionary.has(word)
115
+ && !dictionary.has(word.toLowerCase())
116
+ && !(word.endsWith('s') && dictionary.has(word.slice(0, -1).toLowerCase()))
117
+ ) {
118
+ return word
119
+ // return <FlaggedText word={word} />;
120
+ }
121
+
122
+ return word;
123
+ });
124
+ };
125
+
126
+ return (
127
+ <div>
128
+ {processText()}
129
+ </div>
130
+ );
131
+ }
132
+
133
+ // --- Region Row Component ---
134
+ function RegionRow({ region, imageUrl, pageData, pageIndex, regionIndex, onTextChange, onEnterPress }) {
135
+ const textRef = useRef(null);
136
+ const canvasRef = useRef(null);
137
+
138
+ function handleContentEditableChange(event) {
139
+ const newText = event.target.innerText;
140
+ onTextChange(pageIndex, regionIndex, newText);
141
+ }
142
+
143
+ // Ensure the defensive check is present
144
+ if (!region || !region.bbox || region.bbox.length !== 4) {
145
+ console.warn("RegionRow received invalid region prop - skipping render.", { region });
146
+ return null; // Don't render anything if region is invalid
147
+ }
148
+
149
+ // --- Calculate dimensions ---
150
+ const imgScale = 1.0; // Set to 1.0 assuming bbox coords match image pixels
151
+ const [x0, y0, x1, y1] = region.bbox;
152
+ // Source coordinates and dimensions on the original image (now directly from bbox)
153
+ const sourceX = x0 * imgScale; // Now just x0
154
+ const sourceY = y0 * imgScale; // Now just y0
155
+ const sourceWidth = (x1 - x0) * imgScale; // Now just width from bbox
156
+ const sourceHeight = (y1 - y0) * imgScale; // Now just height from bbox
157
+
158
+ // --- useEffect for drawing on canvas ---
159
+ useEffect(() => {
160
+ if (!imageUrl || !canvasRef.current) {
161
+ return; // Don't draw if no image URL or canvas isn't ready
162
+ }
163
+
164
+ const canvas = canvasRef.current;
165
+ const ctx = canvas.getContext('2d');
166
+ const img = new Image();
167
+
168
+ img.onload = () => {
169
+ // Set canvas intrinsic size to match the source snippet dimensions
170
+ canvas.width = sourceWidth;
171
+ canvas.height = sourceHeight;
172
+
173
+ // Draw the specific region from the loaded image onto the canvas
174
+ ctx.drawImage(
175
+ img, // Source image
176
+ sourceX, // Source X
177
+ sourceY, // Source Y
178
+ sourceWidth,// Source Width
179
+ sourceHeight,// Source Height
180
+ 0, // Destination X (on canvas)
181
+ 0, // Destination Y (on canvas)
182
+ sourceWidth,// Destination Width (on canvas)
183
+ sourceHeight // Destination Height (on canvas)
184
+ );
185
+ };
186
+
187
+ img.onerror = (err) => {
188
+ console.error("Failed to load image for canvas:", imageUrl, err);
189
+ // Optionally draw an error message or clear the canvas
190
+ ctx.clearRect(0, 0, canvas.width, canvas.height);
191
+ ctx.fillStyle = "red";
192
+ ctx.fillText("Error loading image", 10, 20);
193
+ };
194
+
195
+ img.src = imageUrl; // Start loading the image
196
+
197
+ // No cleanup needed for image loading, but good practice for other effects
198
+ // return () => { /* cleanup code */ };
199
+
200
+ }, [imageUrl, region, sourceX, sourceY, sourceWidth, sourceHeight]); // Re-run effect if image or region changes
201
+
202
+
203
+ // Style for the canvas container/scaling
204
+ const canvasStyle = {
205
+ // Display at 2x size (defined by sourceWidth)
206
+ width: `${sourceWidth}px`,
207
+ // Let height adjust automatically based on width and aspect ratio
208
+ height: `auto`,
209
+ // But constrain width to container
210
+ maxWidth: '100%',
211
+ // Remove transform scaling
212
+ // transform: displayScale < 1 ? `scale(${displayScale})` : 'none',
213
+ // transformOrigin: 'top left',
214
+ // Keep other relevant styles
215
+ borderRadius: '3px',
216
+ boxShadow: '0 1px 3px rgba(0,0,0,0.2)',
217
+ display: 'inline-block', // Or 'block' depending on desired layout flow
218
+ margin: '0 auto',
219
+ };
220
+ // --- END Canvas Logic ---
221
+
222
+ const confidenceLevel = region.confidence >= 0.8 ? 'high' : (region.confidence >= 0.5 ? 'medium' : 'low');
223
+ const confidenceText = region.confidence !== null && region.confidence !== undefined ? region.confidence.toFixed(2) : 'N/A';
224
+ const handleInputChange = (e) => {
225
+ onTextChange(pageIndex, regionIndex, e.target.value);
226
+ };
227
+ // const handleFocus = (e) => {
228
+ // e.target.select();
229
+ // };
230
+ const handleKeyDown = (e) => {
231
+ if (e.key === 'Enter') {
232
+ e.preventDefault();
233
+ onEnterPress(e.target);
234
+ }
235
+ };
236
+
237
+ // Convert to JSX
238
+ return (
239
+ <div className="region-item" data-region-id={region.id} data-confidence={region.confidence} data-modified={region.modified}>
240
+ <div className="confidence-cell" data-level={confidenceLevel}>
241
+ {confidenceText}
242
+ </div>
243
+
244
+ <div className="region-content-cell">
245
+ {/* Replace div with canvas */}
246
+ <canvas
247
+ ref={canvasRef}
248
+ style={canvasStyle}
249
+ // Set initial width/height perhaps? Or rely purely on useEffect
250
+ // width={sourceWidth} // Setting via useEffect is generally better
251
+ // height={sourceHeight}
252
+ className="image-clip-canvas" // Add a class for potential styling
253
+ ></canvas>
254
+ <div
255
+ ref={textRef}
256
+ contentEditable
257
+ // rows={Math.max(1, Math.ceil((region.corrected_text || '').length / 50))}
258
+ data-page-index={pageIndex}
259
+ data-region-index={regionIndex}
260
+ data-original-text={region.ocr_text}
261
+ // onInput={handleInputChange}
262
+ // onFocus={handleFocus}
263
+ // onKeyDown={handleKeyDown}
264
+ className={`editing-content ${region.modified ? 'modified' : ''}`}
265
+ // onInput={handleContentEditableChange} // Handle text changes
266
+ >
267
+ <CheckedText inputText={region.corrected_text} />
268
+ {/* <CheckedText inputText={region.ocr_text} /> */}
269
+ </div>
270
+ </div>
271
+ </div>
272
+ );
273
+ }
274
+
275
+ // --- Region Table Component ---
276
+ function RegionTable({ regions, imageUrl, pageData, pageIndex, onTextChange, onEnterPress }) {
277
+ // Ensure filtering is done using useMemo for efficiency
278
+ const validRegions = useMemo(() =>
279
+ (regions || []).filter(r => r && r.id && r.bbox && typeof r.ocr_text === 'string'),
280
+ [regions] // Recalculate only when regions array changes
281
+ );
282
+
283
+ if (!validRegions || validRegions.length === 0) {
284
+ // Convert to JSX
285
+ return <p>No valid OCR regions to display for this page section.</p>;
286
+ }
287
+
288
+ // Convert to JSX
289
+ return (
290
+ <div className="region-list">
291
+ {validRegions.map((region, rIndex) => (
292
+ <RegionRow
293
+ key={region.id}
294
+ region={region}
295
+ imageUrl={imageUrl}
296
+ pageData={pageData}
297
+ pageIndex={pageIndex}
298
+ regionIndex={rIndex}
299
+ onTextChange={onTextChange}
300
+ onEnterPress={onEnterPress}
301
+ />
302
+ ))}
303
+ </div>
304
+ );
305
+ }
306
+
307
+ // --- Main Application Component ---
308
+ function App() {
309
+ // State for the application
310
+ const [taskData, setTaskData] = useState(null); // Holds the parsed manifest.json
311
+ const [imageData, setImageData] = useState({}); // Holds { relativePath: objectURL }
312
+ const [isLoading, setIsLoading] = useState(false);
313
+ const [error, setError] = useState(null);
314
+ const [currentFilename, setCurrentFilename] = useState(''); // Name of the loaded zip
315
+
316
+ // --- File Handling ---
317
+ const handleFileChange = useCallback(async (event) => {
318
+ const file = event.target.files[0];
319
+ if (!file) return;
320
+
321
+ setIsLoading(true);
322
+ setError(null);
323
+ setTaskData(null);
324
+ setCurrentFilename(file.name);
325
+ // Revoke previous object URLs
326
+ Object.values(imageData).forEach(URL.revokeObjectURL);
327
+ setImageData({});
328
+
329
+ try {
330
+ const zip = await JSZip.loadAsync(file); // Load zip file
331
+
332
+ // 1. Load Manifest
333
+ const manifestFile = zip.file("manifest.json");
334
+ if (!manifestFile) {
335
+ throw new Error("manifest.json not found in the zip file.");
336
+ }
337
+ const manifestContent = await manifestFile.async("string");
338
+ const parsedManifest = JSON.parse(manifestContent);
339
+ // TODO: Add validation for manifest structure?
340
+ setTaskData(parsedManifest);
341
+
342
+ // 2. Load Images and create Object URLs
343
+ const imagePromises = [];
344
+ const newImageData = {};
345
+ zip.folder("images").forEach((relativePath, fileEntry) => {
346
+ if (!fileEntry.dir) {
347
+ imagePromises.push(
348
+ fileEntry.async("blob").then(blob => {
349
+ const objectURL = URL.createObjectURL(blob);
350
+ // Store URL mapped to the relative path used in manifest
351
+ newImageData[`images/${relativePath}`] = objectURL;
352
+ })
353
+ );
354
+ }
355
+ });
356
+
357
+ await Promise.all(imagePromises);
358
+ setImageData(newImageData);
359
+ console.log("Loaded images:", Object.keys(newImageData));
360
+
361
+
362
+ } catch (err) {
363
+ console.error("Error loading task package:", err);
364
+ setError(`Error loading task package: ${err.message}`);
365
+ setTaskData(null); // Clear data on error
366
+ setImageData({});
367
+ setCurrentFilename('');
368
+ } finally {
369
+ setIsLoading(false);
370
+ // Reset file input value so the same file can be loaded again
371
+ event.target.value = null;
372
+ }
373
+ }, [imageData]); // Depend on imageData to revoke old URLs
374
+
375
+ // --- Text Area Change Handler ---
376
+ const handleTextChange = (pageIndex, regionIndex, newText) => {
377
+ setTaskData(prevData => {
378
+ if (!prevData) return null; // Handle null state
379
+
380
+ const newData = { ...prevData }; // Shallow copy top-level object
381
+ const newPages = [...newData.pages]; // Create a shallow copy of pages array
382
+
383
+ // Ensure pageIndex and regionIndex are within bounds to prevent errors
384
+ if (newPages[pageIndex] && Array.isArray(newPages[pageIndex].regions)) {
385
+ const newRegions = [...newPages[pageIndex].regions]; // Shallow copy regions array
386
+ const region = newRegions[regionIndex];
387
+
388
+ // Update only the specific region's corrected_text and modified fields
389
+ if (region) {
390
+ region.corrected_text = newText;
391
+ region.modified = newText !== region.ocr_text;
392
+ newRegions[regionIndex] = region; // Reassign updated region to array
393
+
394
+ newPages[pageIndex].regions = newRegions; // Reassign updated regions array to page
395
+ }
396
+ }
397
+
398
+ newData.pages = newPages; // Reassign updated pages array to data object
399
+ return newData;
400
+ });
401
+ };
402
+
403
+ // --- Enter Key Navigation Handler ---
404
+ const handleEnterNavigation = (currentTextArea) => {
405
+ const allTextAreas = Array.from(document.querySelectorAll('.text-content-input'));
406
+ const currentIndex = allTextAreas.indexOf(currentTextArea);
407
+
408
+ if (currentIndex > -1 && currentIndex < allTextAreas.length - 1) {
409
+ const nextTextArea = allTextAreas[currentIndex + 1];
410
+ nextTextArea.focus();
411
+ }
412
+ };
413
+
414
+ // --- UI Rendering (Convert to JSX) ---
415
+ return (
416
+ <DictionaryProvider>
417
+ <div className="app-container">
418
+ <div className="task-loader">
419
+ <label htmlFor="zip-input">Load Correction Task Package (.zip): </label>
420
+ <input type="file" id="zip-input" accept=".zip" onChange={handleFileChange} disabled={isLoading} />
421
+ {isLoading && <span> Loading...</span>}
422
+ </div>
423
+
424
+ {error && <div className="error-message" style={{ color: 'red', margin: '10px', padding: '10px', border: '1px solid red' }}>{error}</div>}
425
+
426
+ {!isLoading && !taskData && !error && (
427
+ <div className="initial-message">
428
+ <p>Please load a .zip task package to begin.</p>
429
+ </div>
430
+ )}
431
+
432
+ {taskData && (
433
+ <div className="task-content">
434
+ <h2>Task: {currentFilename}</h2>
435
+ <p>PDF Source: {taskData.pdfs && taskData.pdfs.length > 0 ? taskData.pdfs[0].source : (taskData.pages[0]?.pdf_source || 'Unknown')} ({taskData.pages?.length || 0} pages)</p>
436
+
437
+ <div className="controls-container">
438
+ {/* TODO: Add Export Functionality */}
439
+ <button id="export-corrections" className="export-btn">Export Corrections JSON</button>
440
+ </div>
441
+
442
+ <div className="pages-container">
443
+ {taskData.pages.map((page, pIndex) => (
444
+ <div className="page-section" key={page.image_path}>
445
+ <div className="page-title">
446
+ <h3>Page {page.page_number} (Source: {page.pdf_short_id})</h3>
447
+ </div>
448
+ <RegionTable
449
+ regions={page.regions}
450
+ imageUrl={imageData[page.image_path]} /* Pass the object URL */
451
+ pageData={page} /* Pass page metadata (width, height) */
452
+ pageIndex={pIndex}
453
+ onTextChange={handleTextChange}
454
+ onEnterPress={handleEnterNavigation}
455
+ />
456
+ </div>
457
+ ))}
458
+ </div>
459
+ </div>
460
+ )}
461
+ </div>
462
+ </DictionaryProvider>
463
+ );
464
+ }
465
+
466
+ console.log("DEBUG: Mounting React app...");
467
+ // Mount the app to the DOM using ReactDOM.createRoot
468
+ const root = ReactDOM.createRoot(document.getElementById('app'));
469
+ root.render(<App />);
470
+ console.log("DEBUG: React app mount initiated.");
471
+
472
+ })(); // Immediately invoke the function