@tfw.in/structura-lib 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. package/PRODUCTION_ARCHITECTURE.md +511 -0
  2. package/README.md +379 -0
  3. package/SAVE_FUNCTIONALITY_COMPLETE.md +448 -0
  4. package/dist/cjs/EditableContent.js +150 -0
  5. package/dist/cjs/HtmlViewer.js +587 -0
  6. package/dist/cjs/PdfComponents.js +16 -0
  7. package/dist/cjs/PdfDocumentViewer.js +281 -0
  8. package/dist/cjs/Structura.js +806 -0
  9. package/dist/cjs/Table.js +164 -0
  10. package/dist/cjs/TableCell.js +115 -0
  11. package/dist/cjs/accuracyMetrics.js +39 -0
  12. package/dist/cjs/helpers/preprocessData.js +143 -0
  13. package/dist/cjs/index.js +7 -0
  14. package/dist/cjs/lib/polyfills.js +15 -0
  15. package/dist/cjs/lib/utils.js +10 -0
  16. package/dist/cjs/node_modules/react-icons/fa/index.esm.js +14 -0
  17. package/dist/cjs/node_modules/react-icons/lib/esm/iconBase.js +69 -0
  18. package/dist/cjs/node_modules/react-icons/lib/esm/iconContext.js +15 -0
  19. package/dist/cjs/polyfills.js +19 -0
  20. package/dist/cjs/route.js +102 -0
  21. package/dist/cjs/styles.css +7 -0
  22. package/dist/cjs/styles.css.map +1 -0
  23. package/dist/cjs/ui/badge.js +34 -0
  24. package/dist/cjs/ui/button.js +71 -0
  25. package/dist/cjs/ui/card.js +86 -0
  26. package/dist/cjs/ui/progress.js +45 -0
  27. package/dist/cjs/ui/scroll-area.js +62 -0
  28. package/dist/cjs/ui/tabs.js +60 -0
  29. package/dist/cjs/worker.js +36 -0
  30. package/dist/esm/EditableContent.js +161 -0
  31. package/dist/esm/HtmlViewer.js +640 -0
  32. package/dist/esm/PdfComponents.js +21 -0
  33. package/dist/esm/PdfDocumentViewer.js +294 -0
  34. package/dist/esm/Structura.js +951 -0
  35. package/dist/esm/Table.js +182 -0
  36. package/dist/esm/TableCell.js +122 -0
  37. package/dist/esm/_virtual/_rollupPluginBabelHelpers.js +305 -0
  38. package/dist/esm/accuracyMetrics.js +41 -0
  39. package/dist/esm/helpers/preprocessData.js +152 -0
  40. package/dist/esm/index.js +1 -0
  41. package/dist/esm/lib/polyfills.js +13 -0
  42. package/dist/esm/lib/utils.js +8 -0
  43. package/dist/esm/node_modules/react-icons/fa/index.esm.js +11 -0
  44. package/dist/esm/node_modules/react-icons/lib/esm/iconBase.js +66 -0
  45. package/dist/esm/node_modules/react-icons/lib/esm/iconContext.js +12 -0
  46. package/dist/esm/polyfills.js +17 -0
  47. package/dist/esm/route.js +154 -0
  48. package/dist/esm/styles.css +7 -0
  49. package/dist/esm/styles.css.map +1 -0
  50. package/dist/esm/types/EditableContent.d.ts +9 -0
  51. package/dist/esm/types/HtmlViewer.d.ts +10 -0
  52. package/dist/esm/types/PdfComponents.d.ts +35 -0
  53. package/dist/esm/types/PdfDocumentViewer.d.ts +22 -0
  54. package/dist/esm/types/Structura.d.ts +11 -0
  55. package/dist/esm/types/Table.d.ts +12 -0
  56. package/dist/esm/types/TableCell.d.ts +13 -0
  57. package/dist/esm/types/accuracy.d.ts +23 -0
  58. package/dist/esm/types/accuracyMetrics.d.ts +5 -0
  59. package/dist/esm/types/helpers/flattenJSON.d.ts +1 -0
  60. package/dist/esm/types/helpers/hardMerging.d.ts +2 -0
  61. package/dist/esm/types/helpers/index.d.ts +6 -0
  62. package/dist/esm/types/helpers/jsonToHtml.d.ts +40 -0
  63. package/dist/esm/types/helpers/preprocessData.d.ts +3 -0
  64. package/dist/esm/types/helpers/removeMetadata.d.ts +1 -0
  65. package/dist/esm/types/helpers/tableProcessor.d.ts +1 -0
  66. package/dist/esm/types/index.d.ts +3 -0
  67. package/dist/esm/types/lib/polyfills.d.ts +1 -0
  68. package/dist/esm/types/lib/utils.d.ts +2 -0
  69. package/dist/esm/types/polyfills.d.ts +1 -0
  70. package/dist/esm/types/route.d.ts +45 -0
  71. package/dist/esm/types/test-app/src/App.d.ts +4 -0
  72. package/dist/esm/types/test-app/src/main.d.ts +1 -0
  73. package/dist/esm/types/test-app/vite.config.d.ts +2 -0
  74. package/dist/esm/types/types.d.ts +23 -0
  75. package/dist/esm/types/ui/alert.d.ts +8 -0
  76. package/dist/esm/types/ui/badge.d.ts +9 -0
  77. package/dist/esm/types/ui/button.d.ts +11 -0
  78. package/dist/esm/types/ui/card.d.ts +8 -0
  79. package/dist/esm/types/ui/progress.d.ts +6 -0
  80. package/dist/esm/types/ui/scroll-area.d.ts +5 -0
  81. package/dist/esm/types/ui/skeleton.d.ts +2 -0
  82. package/dist/esm/types/ui/tabs.d.ts +7 -0
  83. package/dist/esm/types/worker.d.ts +1 -0
  84. package/dist/esm/ui/badge.js +31 -0
  85. package/dist/esm/ui/button.js +50 -0
  86. package/dist/esm/ui/card.js +67 -0
  87. package/dist/esm/ui/progress.js +26 -0
  88. package/dist/esm/ui/scroll-area.js +45 -0
  89. package/dist/esm/ui/tabs.js +39 -0
  90. package/dist/esm/worker.js +50 -0
  91. package/dist/index.d.ts +38 -0
  92. package/package.json +85 -0
  93. package/server/README.md +203 -0
  94. package/server/db.js +142 -0
  95. package/server/server.js +165 -0
@@ -0,0 +1,164 @@
1
+ 'use strict';
2
+
3
+ Object.defineProperty(exports, '__esModule', { value: true });
4
+
5
+ var jsxRuntime = require('react/jsx-runtime');
6
+ var React = require('react');
7
+ var TableCell = require('./TableCell.js');
8
+
9
+ // Utility function to clean HTML content
10
/**
 * Normalizes an HTML fragment so it can be rendered inside a table cell.
 *
 * - Falsy input yields an empty string; other non-string input is coerced
 *   with String() instead of throwing.
 * - `<br>` tags (any letter case, optional self-closing slash) become spaces.
 * - The non-standard `<strikethrough>` tag is rewritten to the HTML5 `<s>` tag.
 * - Every whitespace run (including newlines) collapses to a single space and
 *   the result is trimmed.
 *
 * The function is silent: it no longer writes cell content to the console.
 *
 * @param {*} html - Raw HTML fragment.
 * @returns {string} The normalized fragment.
 */
const cleanHtml = html => {
  if (!html) return "";
  return String(html)
    .replace(/<br\s*\/?>/gi, " ")
    .replace(/<strikethrough>/gi, "<s>")
    .replace(/<\/strikethrough>/gi, "</s>")
    // Newlines are whitespace, so this single pass also removes them.
    .replace(/\s+/g, " ")
    .trim();
};
30
/**
 * Renders a table node as a grid of TableCell components.
 *
 * The node's HTML (or its `llm_table_html` when the "Agentic" toggle is on) is
 * parsed with the DOM, its rows are combined with the rows of any
 * `mergedTables`, and every `th`/`td` is paired with the entry of
 * `node.children` at the same position in document order.
 *
 * @param {object} props
 * @param {object} props.node - Table node; reads `id`, `html`, `llm_table_html`, `children`.
 * @param {string} [props.selectedBboxId] - Id of the cell to mark as selected.
 * @param {Function} [props.onJsonClick] - Called with the cell's node data.
 * @param {Function} [props.onContentChange] - Called as `(cellId, newContent)` after an edit.
 * @param {object[]} [props.mergedTables] - Extra table nodes whose rows are appended.
 * @param {boolean} [props.hasLlmHtml] - Whether to show the "Agentic" toggle.
 * @param {boolean} [props.showJsonIcons] - Forwarded to every cell.
 * @param {Function} [props.onNodeClick] - Called with the table id or a cell id on click.
 * @returns {object|null} The rendered element, or null when the HTML contains no table.
 */
function Table({
  node,
  selectedBboxId,
  onJsonClick,
  onContentChange,
  mergedTables = [],
  hasLlmHtml = false,
  showJsonIcons = true,
  onNodeClick
}) {
  const [useLlmHtml, setUseLlmHtml] = React.useState(false);
  const childNodes = Array.isArray(node.children) ? node.children : [];
  // Computed once per render instead of once per cell.
  // TODO: to be fixed later
  const useTableCellContents = childNodes.some(child => child && child.block_type === "TableCell" && child.llm_table_html !== undefined);
  // Pick the LLM HTML when the toggle is on and the node provides it.
  const getHtmlContent = tableNode => {
    if (useLlmHtml && tableNode.llm_table_html) {
      return cleanHtml(tableNode.llm_table_html);
    }
    return cleanHtml(tableNode.html || "");
  };
  // Parse a table node's HTML and collect its rows.
  // NOTE(review): the HTML is assigned to innerHTML unsanitized; confirm the
  // upstream data is trusted or sanitize it before it reaches this component.
  const processTable = tableNode => {
    const tempDiv = document.createElement("div");
    tempDiv.innerHTML = getHtmlContent(tableNode);
    const tableElement = tempDiv.querySelector("table");
    if (!tableElement) return null;
    return {
      rows: Array.from(tableElement.querySelectorAll("tr")),
      tableElement,
      tempDiv
    };
  };
  const mainTable = processTable(node);
  if (!mainTable) return null;
  // Rows of merged tables are appended after the main table's rows.
  const mergedRows = [];
  mergedTables.forEach(table => {
    const processedTable = processTable(table);
    if (processedTable) {
      mergedRows.push(...processedTable.rows);
    }
  });
  const allRows = [...mainTable.rows, ...mergedRows];
  // A running counter keeps the cell -> child mapping correct when rows have
  // different cell counts (colspan, header rows); `rowIndex * cells.length`
  // skips or repeats indices in that case.
  let nextCellIndex = 0;
  const processedRows = allRows.map(row => {
    const cells = Array.from(row.querySelectorAll("th, td"));
    return cells.map(cell => {
      const globalCellIndex = nextCellIndex;
      nextCellIndex += 1;
      const childNode = childNodes[globalCellIndex];
      const fallbackId = `${node.id}-cell-${globalCellIndex}`;
      let cellContent = cleanHtml(cell.innerHTML);
      // Prefer the child's own HTML when the children carry per-cell LLM HTML.
      if (useTableCellContents && childNode && childNode.llm_table_html !== undefined) {
        cellContent = useLlmHtml && childNode.llm_table_html ? cleanHtml(childNode.llm_table_html) : cleanHtml(childNode.html || "");
      }
      // TODO: to be fixed later - only apply dubious highlighting if the cell has llm_table_html
      const isDubious = Boolean(childNode && childNode.dubious && childNode.llm_table_html !== undefined);
      return {
        id: childNode && childNode.id || fallbackId,
        content: cellContent,
        isHeader: cell.tagName.toLowerCase() === "th",
        isDubious,
        nodeData: childNode || {
          id: fallbackId,
          html: cellContent,
          block_type: "TableCell"
        }
      };
    });
  });
  const handleCellContentChange = (cellId, newContent) => {
    if (onContentChange) {
      onContentChange(cellId, newContent);
    }
  };
  const handleTableClick = e => {
    // Only process clicks directly on the table container, not bubbled events from cells
    if (e.target === e.currentTarget && node.id && onNodeClick) {
      onNodeClick(node.id);
    }
  };
  return jsxRuntime.jsxs("div", {
    onClick: handleTableClick,
    className: onNodeClick ? "cursor-pointer" : "",
    children: [hasLlmHtml && jsxRuntime.jsx("div", {
      className: "mb-2 flex justify-end",
      onClick: e => e.stopPropagation(),
      children: jsxRuntime.jsxs("label", {
        className: "inline-flex items-center cursor-pointer",
        children: [jsxRuntime.jsx("input", {
          type: "checkbox",
          checked: useLlmHtml,
          onChange: () => setUseLlmHtml(previous => !previous),
          className: "sr-only peer"
        }), jsxRuntime.jsx("div", {
          className: "relative w-11 h-6 bg-gray-200 peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-blue-300 rounded-full peer peer-checked:after:translate-x-full rtl:peer-checked:after:-translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:start-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all peer-checked:bg-blue-600"
        }), jsxRuntime.jsx("span", {
          className: "ms-3 text-sm font-medium text-gray-900",
          children: "Agentic"
        })]
      })
    }), jsxRuntime.jsx("div", {
      className: "w-full overflow-x-auto",
      children: jsxRuntime.jsx("table", {
        className: "min-w-full divide-y divide-gray-200",
        children: jsxRuntime.jsx("tbody", {
          children: processedRows.map((row, rowIndex) => jsxRuntime.jsx("tr", {
            children: row.map(cell => jsxRuntime.jsx(TableCell.default, {
              id: cell.id,
              content: cell.content,
              onJsonClick: () => {
                if (onJsonClick) onJsonClick(cell.nodeData);
              },
              isSelected: cell.id === selectedBboxId,
              isHeader: cell.isHeader,
              isDubious: cell.isDubious,
              // TableCell invokes this as (id, content); taking the LAST argument
              // forwards the edited text for both (id, content) and (content) callers.
              onContentChange: onContentChange ? (...args) => handleCellContentChange(cell.id, args[args.length - 1]) : undefined,
              showJsonIcons: showJsonIcons,
              onNodeClick: onNodeClick ? () => onNodeClick(cell.id) : undefined
            }, cell.id))
          }, rowIndex))
        })
      })
    })]
  });
}
163
+
164
+ exports.default = Table;
@@ -0,0 +1,115 @@
1
+ 'use strict';
2
+
3
+ Object.defineProperty(exports, '__esModule', { value: true });
4
+
5
+ var jsxRuntime = require('react/jsx-runtime');
6
+ var React = require('react');
7
+ var vsc = require('react-icons/vsc');
8
+
9
+ // Utility function to clean HTML content
10
/**
 * Normalizes an HTML fragment before it is shown or edited in a cell.
 *
 * Falsy input yields ""; other non-string input is coerced with String().
 * `<br>` tags become spaces, `<strikethrough>` is rewritten to the standard
 * `<s>` tag (both matched case-insensitively), whitespace runs collapse to a
 * single space and the result is trimmed. Nothing is written to the console.
 *
 * @param {*} html - Raw HTML fragment.
 * @returns {string} The normalized fragment.
 */
const cleanHtml = html => {
  if (!html) return "";
  const withoutBreaks = String(html).replace(/<br\s*\/?>/gi, " ");
  const withStandardTags = withoutBreaks
    .replace(/<strikethrough>/gi, "<s>")
    .replace(/<\/strikethrough>/gi, "</s>");
  // Collapsing whitespace also takes care of newline characters.
  return withStandardTags.replace(/\s+/g, " ").trim();
};
30
/**
 * A single table cell (`th` or `td`) that can be selected, clicked and, when
 * `onContentChange` is provided, edited in place via double-click.
 *
 * @param {object} props
 * @param {string} props.id - Cell id; used as the DOM id and passed to `onContentChange`.
 * @param {string} [props.content] - Cell HTML; normalized with `cleanHtml` before use.
 * @param {Function} [props.onJsonClick] - Called when the JSON icon button is clicked.
 * @param {boolean} [props.isSelected] - Applies the selected styling.
 * @param {boolean} [props.isHeader] - Render a `th` instead of a `td`.
 * @param {Function} [props.onContentChange] - Called as `(id, editedContent)` on blur after a real edit.
 * @param {boolean} [props.showJsonIcons] - Whether to render the JSON icon button.
 * @param {Function} [props.onNodeClick] - Called (without arguments) on click while not editing.
 * @param {boolean} [props.isDubious] - Applies the "dubious" highlight.
 * @returns {object} The rendered cell element.
 */
function TableCell({
  id,
  content,
  onJsonClick,
  isSelected,
  isHeader = false,
  onContentChange,
  showJsonIcons = true,
  onNodeClick,
  isDubious = false
}) {
  // The normalized form is what the user sees and edits, so it is also the
  // baseline for deciding whether an edit actually happened.
  const cleanedContent = cleanHtml(content);
  const [isEditing, setIsEditing] = React.useState(false);
  const [editedContent, setEditedContent] = React.useState(cleanedContent);
  const [isEdited, setIsEdited] = React.useState(false);
  const CellComponent = isHeader ? "th" : "td";
  // Follow prop changes until the user has edited the cell; after that the
  // user's text is preserved.
  React.useEffect(() => {
    if (!isEdited) {
      setEditedContent(cleanedContent);
    }
  }, [cleanedContent, isEdited]);
  const handleDoubleClick = () => {
    if (onContentChange) {
      setIsEditing(true);
    }
  };
  const handleBlur = () => {
    setIsEditing(false);
    // Compare against the cleaned baseline: comparing against the raw prop would
    // report an edit whenever normalization alone changed the text.
    if (editedContent !== cleanedContent && onContentChange) {
      setIsEdited(true);
      onContentChange(id, editedContent);
    }
  };
  const handleClick = () => {
    if (!isEditing && onNodeClick) {
      onNodeClick();
    }
  };
  const className = [
    "structura-px-4 structura-py-2 structura-border structura-border-gray-200 structura-transition-colors structura-relative structura-group",
    isSelected && "structura-bg-blue-100 structura-border-2 structura-border-blue-500",
    isHeader && "structura-bg-gray-100 structura-font-bold structura-text-gray-800",
    isEdited && "structura-bg-yellow-100",
    isDubious && "structura-bg-red-500/20",
    onNodeClick && !isEditing && "structura-cursor-pointer hover:structura-bg-gray-50"
  ].filter(Boolean).join(" ");
  return jsxRuntime.jsx(CellComponent, {
    id: id,
    className: className,
    onDoubleClick: handleDoubleClick,
    onClick: handleClick,
    children: jsxRuntime.jsxs("div", {
      className: "structura-w-full structura-relative",
      children: [showJsonIcons && jsxRuntime.jsx("button", {
        onClick: e => {
          // Keep the click from also triggering the cell's own click handler.
          e.stopPropagation();
          if (onJsonClick) onJsonClick();
        },
        className: "structura-absolute -structura-right-1 structura-top-0 structura-opacity-0 group-hover:structura-opacity-100 structura-transition-opacity structura-text-gray-500 hover:structura-text-gray-700",
        title: "View JSON",
        children: jsxRuntime.jsx(vsc.VscJson, {
          size: 16
        })
      }), isEditing ? jsxRuntime.jsx("textarea", {
        value: editedContent,
        onChange: e => setEditedContent(e.target.value),
        onBlur: handleBlur,
        autoFocus: true,
        className: "structura-w-full structura-min-h-[24px] structura-p-1 structura-border structura-border-blue-400 structura-rounded focus:structura-outline-none focus:structura-ring-2 focus:structura-ring-blue-500",
        onClick: e => e.stopPropagation()
      }) : jsxRuntime.jsx("div", {
        // NOTE(review): the (possibly user-edited) HTML is injected unsanitized;
        // confirm the content source is trusted or sanitize before rendering.
        dangerouslySetInnerHTML: {
          __html: editedContent
        },
        className: "structura-min-w-0 structura-break-words"
      })]
    })
  });
}
114
+
115
+ exports.default = TableCell;
@@ -0,0 +1,39 @@
1
+ 'use strict';
2
+
3
/**
 * Compares two HTML strings after stripping their tags.
 *
 * @param {string} original - Original HTML; null/undefined is treated as "".
 * @param {string} updated - Updated HTML; null/undefined is treated as "".
 * @returns {{charactersDifferent: number, wordsDifferent: number, editDistance: number}}
 *   `charactersDifferent` and `wordsDifferent` are absolute differences in
 *   length and word count of the stripped text; `editDistance` is whatever
 *   `levenshteinDistance` returns for the two stripped strings.
 */
const calculateDifferences = (original, updated) => {
  const cleanText = text => String(text == null ? "" : text).replace(/<[^>]*>/g, "").trim();
  // "".split(/\s+/) yields [""], which would count empty text as one word.
  const countWords = text => (text === "" ? 0 : text.split(/\s+/).length);
  const originalClean = cleanText(original);
  const updatedClean = cleanText(updated);
  return {
    charactersDifferent: Math.abs(updatedClean.length - originalClean.length),
    wordsDifferent: Math.abs(countWords(updatedClean) - countWords(originalClean)),
    // Levenshtein distance for more accurate character difference
    editDistance: levenshteinDistance(originalClean, updatedClean)
  };
};
16
/**
 * Computes the Levenshtein (edit) distance between two strings, counting
 * insertions, deletions and substitutions of UTF-16 code units.
 *
 * Only two rows of the dynamic-programming table are kept, sized by the
 * shorter string, instead of a full (m + 1) x (n + 1) matrix.
 *
 * @param {string} str1 - First string.
 * @param {string} str2 - Second string.
 * @returns {number} Minimum number of single-unit edits turning one string into the other.
 */
function levenshteinDistance(str1, str2) {
  if (str1 === str2) return 0;
  // The distance is symmetric, so the shorter string can drive the row width.
  const [longer, shorter] = str1.length >= str2.length ? [str1, str2] : [str2, str1];
  const width = shorter.length;
  if (width === 0) return longer.length;
  let previousRow = Array.from({ length: width + 1 }, (_, j) => j);
  let currentRow = new Array(width + 1).fill(0);
  for (let i = 1; i <= longer.length; i++) {
    currentRow[0] = i;
    for (let j = 1; j <= width; j++) {
      if (longer[i - 1] === shorter[j - 1]) {
        currentRow[j] = previousRow[j - 1];
      } else {
        currentRow[j] = 1 + Math.min(
          previousRow[j], // deletion
          currentRow[j - 1], // insertion
          previousRow[j - 1] // substitution
        );
      }
    }
    // Reuse the old row as scratch space for the next iteration.
    [previousRow, currentRow] = [currentRow, previousRow];
  }
  return previousRow[width];
}
38
+
39
+ exports.calculateDifferences = calculateDifferences;
@@ -0,0 +1,143 @@
1
+ 'use strict';
2
+
3
+ // Helper function to sort table cells by page number and bounding box coordinates
4
/**
 * Sorts table cells in place by the page number and block number encoded in
 * their ids (e.g. '/page/2/TableCell/10'), page first, then block.
 *
 * Cells whose id is missing, not a string, or does not match the pattern sort
 * as (0, 0) instead of throwing.
 *
 * @param {Array<{id?: string}>} tableCells - Cells to sort; the array is mutated.
 * @returns {Array<{id?: string}>} The same array instance, sorted.
 */
function sortTableCells(tableCells) {
  const idPattern = /^\/page\/(\d+)\/TableCell\/(\d+)$/;
  const getPageAndBlockNumber = id => {
    const match = typeof id === "string" ? id.match(idPattern) : null;
    if (!match) {
      return [0, 0]; // Default to (0,0) if pattern not found
    }
    return [Number.parseInt(match[1], 10), Number.parseInt(match[2], 10)];
  };
  // Parse every id once up front rather than on each comparison.
  const sortKeys = new Map(tableCells.map(cell => [cell, getPageAndBlockNumber(cell ? cell.id : undefined)]));
  tableCells.sort((a, b) => {
    const [pageA, blockA] = sortKeys.get(a);
    const [pageB, blockB] = sortKeys.get(b);
    return pageA !== pageB ? pageA - pageB : blockA - blockB;
  });
  return tableCells;
}
23
+ // Strikethrough correction function
24
/**
 * Returns a deep copy of the document tree in which every `<strikethrough>`
 * tag found in a node's `html` or `llm_table_html` string is replaced by `<del>`.
 *
 * The input is never mutated. `null`/`undefined` input is returned unchanged,
 * and non-object entries inside `children` are skipped, so callers further
 * down the pipeline get a chance to handle missing data themselves.
 *
 * @param {object|null|undefined} jsonData - Root node of the document tree.
 * @returns {object|null|undefined} The corrected deep copy.
 */
function strikethroughCorrection(jsonData) {
  if (jsonData === null || jsonData === undefined) return jsonData;
  // Deep clone to avoid mutating the original
  const modifiedJson = JSON.parse(JSON.stringify(jsonData));
  const processHtml = html => html.replace(/<strikethrough>/gi, "<del>").replace(/<\/strikethrough>/gi, "</del>");
  const processNode = node => {
    if (!node || typeof node !== "object") return;
    // Only strings can carry markup; anything else is left untouched.
    if (typeof node.html === "string") {
      node.html = processHtml(node.html);
    }
    if (typeof node.llm_table_html === "string") {
      node.llm_table_html = processHtml(node.llm_table_html);
    }
    if (Array.isArray(node.children)) {
      node.children.forEach(processNode);
    }
  };
  processNode(modifiedJson);
  return modifiedJson;
}
51
+ // Remove picture nodes function
52
/**
 * Returns a deep copy of the document tree with every node whose
 * `block_type` is "Picture" removed from its parent's `children`.
 *
 * The input is never mutated. `null`/`undefined` input is returned unchanged
 * and non-object entries inside `children` are kept as they are.
 *
 * @param {object|null|undefined} jsonData - Root node of the document tree.
 * @returns {object|null|undefined} The filtered deep copy.
 */
function removePictureNodes(jsonData) {
  if (jsonData === null || jsonData === undefined) return jsonData;
  // Deep clone to avoid mutating the original
  const modifiedJson = JSON.parse(JSON.stringify(jsonData));
  const isPicture = child => Boolean(child) && child.block_type === "Picture";
  const processPictureNodes = node => {
    if (node && Array.isArray(node.children)) {
      node.children = node.children.filter(child => !isPicture(child)).map(processPictureNodes);
    }
    return node;
  };
  // The root itself is never removed, only descendants.
  return processPictureNodes(modifiedJson);
}
67
+ // MAP 'html' and 'llm_table_html' (if present) of block_type 'Table' to their respective children i.e 'TableCell'
68
/**
 * Copies the per-cell markup of each Table / TableOfContents block onto its
 * TableCell children.
 *
 * For every such block with `html` and children, the `td`/`th` elements of
 * `block.html` (and of `block.llm_table_html`, when present) are extracted in
 * document order and their `outerHTML` is written to the `html` /
 * `llm_table_html` of the TableCell children, in the order produced by
 * `sortTableCells`.
 *
 * The input is never mutated; a deep copy is returned. When no `DOMParser`
 * global exists (e.g. server-side rendering) the copy is returned unmapped
 * instead of throwing a ReferenceError.
 *
 * @param {object} jsonData - Document root whose `children` are pages.
 * @returns {object} The mapped deep copy, or `jsonData` itself when it is
 *   falsy or has no `children`.
 */
function mapTableChildren(jsonData) {
  // Skip if there's no JSON data or no pages
  if (!jsonData || !jsonData.children) {
    console.log("No valid JSON data to process");
    return jsonData;
  }
  // Create a deep copy to avoid modifying the original
  const modifiedJsonData = JSON.parse(JSON.stringify(jsonData));
  if (typeof DOMParser === "undefined") {
    console.warn("DOMParser is not available; skipping table HTML to table cell mapping");
    return modifiedJsonData;
  }
  // One parser instance serves every table in the document.
  const parser = new DOMParser();
  const extractCells = html => Array.from(parser.parseFromString(html, "text/html").querySelectorAll("td, th"));
  const mapBlock = block => {
    // Only process Table and TableOfContents blocks
    if (!block || (block.block_type !== "Table" && block.block_type !== "TableOfContents")) {
      return block;
    }
    // Skip if block has no children or no HTML
    if (!block.children || !block.html) {
      return block;
    }
    // Get all TableCell children and sort them by position
    const tableCells = sortTableCells(block.children.filter(cell => cell && cell.block_type === "TableCell"));
    if (tableCells.length === 0) {
      return block;
    }
    const htmlCells = extractCells(block.html);
    const llmHtmlCells = block.llm_table_html ? extractCells(block.llm_table_html) : [];
    // Safety check: log warning if HTML cells count doesn't match TableCell nodes
    if (htmlCells.length !== tableCells.length) {
      console.warn(`Warning: Cell count mismatch in ${block.block_type} ${block.id}. ` + `JSON has ${tableCells.length} cells, HTML has ${htmlCells.length} cells.`);
    }
    // Only the overlapping prefix can be mapped when the counts differ.
    const maxCells = Math.min(tableCells.length, htmlCells.length);
    for (let i = 0; i < maxCells; i++) {
      tableCells[i].html = htmlCells[i].outerHTML;
      // llmHtmlCells is empty when the block has no LLM HTML.
      if (i < llmHtmlCells.length) {
        tableCells[i].llm_table_html = llmHtmlCells[i].outerHTML;
      }
    }
    return block;
  };
  // Process each page in the document
  modifiedJsonData.children = modifiedJsonData.children.map(page => {
    // Skip pages that are missing or have no list of blocks
    if (!page || !Array.isArray(page.children)) {
      return page;
    }
    page.children = page.children.map(mapBlock);
    return page;
  });
  console.log("Finished mapping table HTML to table cells");
  return modifiedJsonData;
}
129
/**
 * Runs the preprocessing pipeline on a document tree, in order:
 * strikethrough correction, picture-node removal, then mapping of table HTML
 * onto table cells.
 *
 * @param {object} jsonData - Document tree to preprocess.
 * @returns {object} Whatever the final pipeline step returns.
 */
function preprocessData(jsonData) {
  // Step 1: replace 'strikethrough' with 'del'.
  const withoutStrikethrough = strikethroughCorrection(jsonData);
  // Step 2: remove picture nodes.
  const withoutPictures = removePictureNodes(withoutStrikethrough);
  // TODO: table enrichment (processTablesForEnrichment) is still disabled and
  // should run here, before the mapping step, once enrichment and mapping are corrected.
  // Step 3: map table HTML to table cells.
  return mapTableChildren(withoutPictures);
}
141
+
142
+ exports.preprocessData = preprocessData;
143
+ exports.sortTableCells = sortTableCells;
@@ -0,0 +1,7 @@
1
+ 'use strict';
2
+
3
+ var Structura = require('./Structura.js');
4
+
5
+
6
+
7
+ exports.Structura = Structura.default;
@@ -0,0 +1,15 @@
1
+ 'use strict';
2
+
3
+ // Polyfill for Promise.withResolvers
4
+ // This is needed for Next.js build compatibility
5
// Install Promise.withResolvers only when the runtime does not provide it.
if (typeof Promise.withResolvers !== 'function') {
  Promise.withResolvers = function () {
    // Pre-declare the keys so the result keeps the promise/resolve/reject order.
    const deferred = { promise: undefined, resolve: undefined, reject: undefined };
    // The executor runs synchronously, so both settlers are captured before
    // the constructor returns.
    deferred.promise = new Promise((fulfill, fail) => {
      deferred.resolve = fulfill;
      deferred.reject = fail;
    });
    return deferred;
  };
}
@@ -0,0 +1,10 @@
1
+ 'use strict';
2
+
3
+ var clsx = require('clsx');
4
+ var tailwindMerge = require('tailwind-merge');
5
+
6
/**
 * Builds a class string from the given inputs with clsx, then passes it
 * through tailwind-merge's twMerge.
 *
 * @param {...*} inputs - Class values, forwarded to clsx as a single array.
 * @returns {string} The result of twMerge.
 */
function cn(...inputs) {
  const combined = clsx(inputs);
  return tailwindMerge.twMerge(combined);
}
9
+
10
+ exports.cn = cn;
@@ -0,0 +1,14 @@
1
+ 'use strict';
2
+
3
+ var iconBase = require('../lib/esm/iconBase.js');
4
+ require('../lib/esm/iconContext.js');
5
+
6
+ // THIS FILE IS AUTO GENERATED
7
/**
 * Bar-chart icon: builds an icon component from the SVG description below via
 * iconBase.GenIcon and immediately renders it with the given props.
 */
function FaChartBar (props) {
  var iconData = {
    "attr": {"viewBox": "0 0 512 512"},
    "child": [{
      "tag": "path",
      "attr": {"d": "M332.8 320h38.4c6.4 0 12.8-6.4 12.8-12.8V172.8c0-6.4-6.4-12.8-12.8-12.8h-38.4c-6.4 0-12.8 6.4-12.8 12.8v134.4c0 6.4 6.4 12.8 12.8 12.8zm96 0h38.4c6.4 0 12.8-6.4 12.8-12.8V76.8c0-6.4-6.4-12.8-12.8-12.8h-38.4c-6.4 0-12.8 6.4-12.8 12.8v230.4c0 6.4 6.4 12.8 12.8 12.8zm-288 0h38.4c6.4 0 12.8-6.4 12.8-12.8v-70.4c0-6.4-6.4-12.8-12.8-12.8h-38.4c-6.4 0-12.8 6.4-12.8 12.8v70.4c0 6.4 6.4 12.8 12.8 12.8zm96 0h38.4c6.4 0 12.8-6.4 12.8-12.8V108.8c0-6.4-6.4-12.8-12.8-12.8h-38.4c-6.4 0-12.8 6.4-12.8 12.8v198.4c0 6.4 6.4 12.8 12.8 12.8zM496 384H64V80c0-8.84-7.16-16-16-16H16C7.16 64 0 71.16 0 80v336c0 17.67 14.33 32 32 32h464c8.84 0 16-7.16 16-16v-32c0-8.84-7.16-16-16-16z"}
    }]
  };
  var render = iconBase.GenIcon(iconData);
  return render(props);
}

/**
 * File-download icon: builds an icon component from the SVG description below
 * via iconBase.GenIcon and immediately renders it with the given props.
 */
function FaFileDownload (props) {
  var iconData = {
    "attr": {"viewBox": "0 0 384 512"},
    "child": [{
      "tag": "path",
      "attr": {"d": "M224 136V0H24C10.7 0 0 10.7 0 24v464c0 13.3 10.7 24 24 24h336c13.3 0 24-10.7 24-24V160H248c-13.2 0-24-10.8-24-24zm76.45 211.36l-96.42 95.7c-6.65 6.61-17.39 6.61-24.04 0l-96.42-95.7C73.42 337.29 80.54 320 94.82 320H160v-80c0-8.84 7.16-16 16-16h32c8.84 0 16 7.16 16 16v80h65.18c14.28 0 21.4 17.29 11.27 27.36zM377 105L279.1 7c-4.5-4.5-10.6-7-17-7H256v128h128v-6.1c0-6.3-2.5-12.4-7-16.9z"}
    }]
  };
  var render = iconBase.GenIcon(iconData);
  return render(props);
}
12
+
13
+ exports.FaChartBar = FaChartBar;
14
+ exports.FaFileDownload = FaFileDownload;
@@ -0,0 +1,69 @@
1
+ 'use strict';
2
+
3
+ var React = require('react');
4
+ var iconContext = require('./iconContext.js');
5
+
6
// Shallow-merge helper. On first call it replaces itself with Object.assign
// when that exists, otherwise with a manual own-property copy loop, and then
// forwards the call to the chosen implementation.
var __assign = function () {
  __assign = Object.assign || function (target) {
    var count = arguments.length;
    for (var index = 1; index < count; index++) {
      var source = arguments[index];
      for (var key in source) {
        if (Object.prototype.hasOwnProperty.call(source, key)) {
          target[key] = source[key];
        }
      }
    }
    return target;
  };
  return __assign.apply(this, arguments);
};
16
// Returns a new object holding the own enumerable properties of `s` (string
// and symbol keys) whose key is not listed in `e`.
var __rest = function (s, e) {
  var remaining = {};
  for (var key in s) {
    if (!Object.prototype.hasOwnProperty.call(s, key)) continue;
    if (e.indexOf(key) >= 0) continue;
    remaining[key] = s[key];
  }
  if (s != null && typeof Object.getOwnPropertySymbols === "function") {
    var symbols = Object.getOwnPropertySymbols(s);
    for (var index = 0; index < symbols.length; index++) {
      var symbol = symbols[index];
      // for...in never visits symbols, so enumerable ones are copied here.
      if (e.indexOf(symbol) < 0 && Object.prototype.propertyIsEnumerable.call(s, symbol)) {
        remaining[symbol] = s[symbol];
      }
    }
  }
  return remaining;
};
24
// Recursively turns an array of { tag, attr, child } descriptions into React
// elements, using each node's array index as its key. A falsy tree is
// returned as-is.
function Tree2Element(tree) {
  if (!tree) return tree;
  return tree.map(function (node, index) {
    var elementProps = __assign({
      key: index
    }, node.attr);
    var nested = Tree2Element(node.child);
    return React.createElement(node.tag, elementProps, nested);
  });
}
31
// Creates an icon component from static SVG data: the returned function
// renders IconBase with a copy of data.attr as `attr` (caller props are merged
// on top and may override it) and the element tree built from data.child.
function GenIcon(data) {
  // eslint-disable-next-line react/display-name
  return function (props) {
    var defaults = {
      attr: __assign({}, data.attr)
    };
    var iconProps = __assign(defaults, props);
    var children = Tree2Element(data.child);
    return React.createElement(IconBase, iconProps, children);
  };
}
39
// Renders the <svg> wrapper for an icon. Size, color, className, style and
// attr are taken from the props and fall back to the icon context value;
// when no IconContext exists, DefaultContext is used directly.
function IconBase(props) {
  var renderSvg = function (conf) {
    var attr = props.attr;
    var size = props.size;
    var title = props.title;
    // Everything that is not consumed here is forwarded onto the <svg>.
    var svgProps = __rest(props, ["attr", "size", "title"]);
    var computedSize = size || conf.size || "1em";
    var className;
    if (conf.className) {
      className = conf.className;
    }
    if (props.className) {
      var prefix = className ? className + " " : "";
      className = prefix + props.className;
    }
    var baseStyle = {
      color: props.color || conf.color
    };
    var style = __assign(__assign(baseStyle, conf.style), props.style);
    var defaults = {
      stroke: "currentColor",
      fill: "currentColor",
      strokeWidth: "0"
    };
    // Later sources win: context attr, then icon attr, then caller props,
    // then the computed values below.
    var finalProps = __assign(defaults, conf.attr, attr, svgProps, {
      className: className,
      style: style,
      height: computedSize,
      width: computedSize,
      xmlns: "http://www.w3.org/2000/svg"
    });
    var titleElement = title && React.createElement("title", null, title);
    return React.createElement("svg", finalProps, titleElement, props.children);
  };
  if (iconContext.IconContext !== undefined) {
    return React.createElement(iconContext.IconContext.Consumer, null, function (conf) {
      return renderSvg(conf);
    });
  }
  return renderSvg(iconContext.DefaultContext);
}
67
+
68
+ exports.GenIcon = GenIcon;
69
+ exports.IconBase = IconBase;
@@ -0,0 +1,15 @@
1
+ 'use strict';
2
+
3
+ var React = require('react');
4
+
5
// Default icon configuration: every option is present but unset.
var DefaultContext = {};
["color", "size", "className", "style", "attr"].forEach(function (option) {
  DefaultContext[option] = undefined;
});
// Create the context only when this React build provides createContext;
// otherwise IconContext carries the same falsy value createContext had.
var IconContext = React.createContext ? React.createContext(DefaultContext) : React.createContext;
13
+
14
+ exports.DefaultContext = DefaultContext;
15
+ exports.IconContext = IconContext;
@@ -0,0 +1,19 @@
1
+ 'use strict';
2
+
3
+ // Polyfill for Promise.withResolvers
4
+ // This is needed for Next.js build compatibility
5
// Provide Promise.withResolvers on runtimes that lack it.
if (typeof Promise.withResolvers !== 'function') {
  Promise.withResolvers = function () {
    // The executor is invoked synchronously with (resolve, reject); stash both
    // so they can be handed out next to the promise.
    const settlers = [];
    const promise = new Promise((...fns) => {
      settlers.push(...fns);
    });
    const [resolve, reject] = settlers;
    return {
      promise,
      resolve,
      reject
    };
  };
}