rnsr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,799 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RNSR Table Parser
|
|
3
|
+
|
|
4
|
+
Deep parsing of tables from documents.
|
|
5
|
+
Extracts structure, headers, cells, and enables cell-level retrieval.
|
|
6
|
+
|
|
7
|
+
Features:
|
|
8
|
+
- Table structure extraction (rows, columns, headers)
|
|
9
|
+
- Cell-level retrieval (answer questions about specific cells)
|
|
10
|
+
- SQL-like queries over extracted table data
|
|
11
|
+
- Support for merged cells, multi-level headers
|
|
12
|
+
|
|
13
|
+
Integrates with the ingestion pipeline to extract tables from PDFs.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import json
|
|
19
|
+
import re
|
|
20
|
+
from dataclasses import dataclass, field
|
|
21
|
+
from typing import Any, Literal
|
|
22
|
+
from uuid import uuid4
|
|
23
|
+
|
|
24
|
+
import structlog
|
|
25
|
+
|
|
26
|
+
logger = structlog.get_logger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# =============================================================================
|
|
30
|
+
# Data Models
|
|
31
|
+
# =============================================================================
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class TableCell:
    """One cell of a parsed table, addressed by zero-based (row, col)."""

    row: int
    col: int
    value: str

    # Cell properties
    is_header: bool = False
    is_merged: bool = False
    rowspan: int = 1
    colspan: int = 1

    # Type inference (populated by TableParser when infer_types is enabled)
    data_type: Literal["text", "number", "currency", "percentage", "date", "empty"] = "text"
    numeric_value: float | None = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize the cell as a plain dictionary of all its fields."""
        keys = (
            "row",
            "col",
            "value",
            "is_header",
            "is_merged",
            "rowspan",
            "colspan",
            "data_type",
            "numeric_value",
        )
        return {key: getattr(self, key) for key in keys}
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass
class TableRow:
    """A single horizontal row of cells within a table."""

    index: int
    cells: list[TableCell] = field(default_factory=list)
    is_header_row: bool = False

    def to_dict(self) -> dict[str, Any]:
        """Serialize the row, including every contained cell, as a dictionary."""
        serialized_cells = [cell.to_dict() for cell in self.cells]
        return {
            "index": self.index,
            "cells": serialized_cells,
            "is_header_row": self.is_header_row,
        }

    def get_values(self) -> list[str]:
        """Return the text of every cell, left to right."""
        return [cell.value for cell in self.cells]
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@dataclass
class TableColumn:
    """A single vertical column of cells within a table."""

    index: int
    header: str = ""
    data_type: str = "text"
    cells: list[TableCell] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Serialize column metadata (cells are summarized by count only)."""
        return {
            "index": self.index,
            "header": self.header,
            "data_type": self.data_type,
            "cell_count": len(self.cells),
        }

    def get_values(self) -> list[str]:
        """Return the text of every cell, top to bottom."""
        return [cell.value for cell in self.cells]
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@dataclass
class ParsedTable:
    """A fully parsed table: structure, metadata, and export helpers.

    Holds both a row-major view (``rows``) and a column-major view
    (``columns``) of the same cells, plus the detected header texts.
    """

    # Short random ID, e.g. "table_1a2b3c4d".
    id: str = field(default_factory=lambda: f"table_{str(uuid4())[:8]}")

    # Source information
    doc_id: str = ""
    page_num: int | None = None
    node_id: str = ""

    # Table metadata
    title: str = ""
    caption: str = ""

    # Structure
    rows: list[TableRow] = field(default_factory=list)
    columns: list[TableColumn] = field(default_factory=list)

    # Dimensions
    num_rows: int = 0
    num_cols: int = 0
    header_rows: int = 1

    # Content: header text per column, in column order
    headers: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Serialize the table, including all rows and columns, as a dict."""
        return {
            "id": self.id,
            "doc_id": self.doc_id,
            "page_num": self.page_num,
            "node_id": self.node_id,
            "title": self.title,
            "caption": self.caption,
            "num_rows": self.num_rows,
            "num_cols": self.num_cols,
            "header_rows": self.header_rows,
            "headers": self.headers,
            "rows": [r.to_dict() for r in self.rows],
            "columns": [c.to_dict() for c in self.columns],
        }

    def get_cell(self, row: int, col: int) -> TableCell | None:
        """Return the cell at (row, col), or None when out of bounds."""
        if 0 <= row < len(self.rows):
            row_obj = self.rows[row]
            if 0 <= col < len(row_obj.cells):
                return row_obj.cells[col]
        return None

    def get_row(self, index: int) -> TableRow | None:
        """Return the row at ``index``, or None when out of bounds."""
        if 0 <= index < len(self.rows):
            return self.rows[index]
        return None

    def get_column(self, index: int) -> TableColumn | None:
        """Return the column at ``index``, or None when out of bounds."""
        if 0 <= index < len(self.columns):
            return self.columns[index]
        return None

    def get_column_by_header(self, header: str) -> TableColumn | None:
        """Return the first column whose header matches case-insensitively."""
        header_lower = header.lower()
        for col in self.columns:
            if col.header.lower() == header_lower:
                return col
        return None

    def to_markdown(self) -> str:
        """Convert the table to markdown format.

        Pipe characters inside headers and cell values are escaped as
        ``\\|`` so they cannot break the column layout. Header rows are
        rendered once from ``headers``; rows flagged ``is_header_row`` are
        skipped in the body to avoid duplication.
        """
        if not self.rows:
            return ""

        lines = []

        # Title
        if self.title:
            lines.append(f"**{self.title}**\n")

        # Headers
        if self.headers:
            escaped = [h.replace("|", "\\|") for h in self.headers]
            lines.append("| " + " | ".join(escaped) + " |")
            lines.append("|" + "|".join(["---"] * len(self.headers)) + "|")

        # Data rows
        for row in self.rows:
            if not row.is_header_row:
                values = [c.value.replace("|", "\\|") for c in row.cells]
                lines.append("| " + " | ".join(values) + " |")

        # Caption
        if self.caption:
            lines.append(f"\n*{self.caption}*")

        return "\n".join(lines)

    def to_csv(self) -> str:
        """Convert the table to CSV text.

        Quoting follows RFC 4180: embedded double quotes are doubled, and
        any value containing a comma, quote, or line break is wrapped in
        double quotes (previously embedded newlines were left unquoted and
        corrupted the row structure).
        """
        lines = []

        for row in self.rows:
            values = []
            for cell in row.cells:
                value = cell.value.replace('"', '""')
                if ',' in value or '"' in value or '\n' in value or '\r' in value:
                    value = f'"{value}"'
                values.append(value)
            lines.append(",".join(values))

        return "\n".join(lines)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
# =============================================================================
|
|
229
|
+
# Table Parser
|
|
230
|
+
# =============================================================================
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
class TableParser:
    """
    Parses tables from text-based formats into ParsedTable objects.

    Supports:
    - Plain text tables (markdown pipe syntax, ASCII bordered tables)
    - HTML tables (requires beautifulsoup4; returns [] when unavailable)
    - PDF table extraction (via PyMuPDF)

    NOTE(review): no PDF entry point is visible in this class despite the
    docstring claim above — confirm where PyMuPDF extraction happens.
    """

    def __init__(
        self,
        infer_types: bool = True,
        detect_headers: bool = True,
    ):
        """
        Initialize the table parser.

        Args:
            infer_types: Whether to infer cell data types (number, currency,
                percentage, date) from each cell's text.
            detect_headers: Whether to treat the first row of every parsed
                table as a header row.
        """
        self.infer_types = infer_types
        self.detect_headers = detect_headers

    def parse_from_text(
        self,
        text: str,
        doc_id: str = "",
        page_num: int | None = None,
        node_id: str = "",
    ) -> list[ParsedTable]:
        """
        Parse tables from plain text.

        Markdown-style (pipe-delimited) tables are tried first; ASCII
        bordered tables are only attempted when no markdown table is found.

        Args:
            text: Text containing table(s).
            doc_id: Document ID recorded on each parsed table.
            page_num: Page number recorded on each parsed table.
            node_id: Node ID recorded on each parsed table.

        Returns:
            List of ParsedTable objects (possibly empty).
        """
        tables = []

        # Try markdown table parsing
        markdown_tables = self._parse_markdown_tables(text)

        for raw_table in markdown_tables:
            parsed = self._process_raw_table(
                raw_table,
                doc_id=doc_id,
                page_num=page_num,
                node_id=node_id,
            )
            if parsed:
                tables.append(parsed)

        # Try ASCII table parsing if no markdown found
        if not tables:
            ascii_tables = self._parse_ascii_tables(text)
            for raw_table in ascii_tables:
                parsed = self._process_raw_table(
                    raw_table,
                    doc_id=doc_id,
                    page_num=page_num,
                    node_id=node_id,
                )
                if parsed:
                    tables.append(parsed)

        logger.info("tables_parsed", count=len(tables))

        return tables

    def _parse_markdown_tables(self, text: str) -> list[list[list[str]]]:
        """Parse markdown-style tables.

        Returns raw tables: each is a list of rows, each row a list of cell
        strings. Consecutive pipe-containing lines form one table; blocks of
        fewer than two data lines are discarded.
        """
        tables = []

        # Find table blocks (consecutive lines with |)
        lines = text.split('\n')
        current_table: list[list[str]] = []

        for line in lines:
            line = line.strip()

            if '|' in line:
                # Skip separator lines (e.g. |---|:---:| between header and body)
                if re.match(r'^[\|\s\-:]+$', line):
                    continue

                # Parse cells
                cells = [c.strip() for c in line.split('|')]
                # Remove empty first/last if line starts/ends with |
                if cells and cells[0] == '':
                    cells = cells[1:]
                if cells and cells[-1] == '':
                    cells = cells[:-1]

                if cells:
                    current_table.append(cells)
            else:
                # End of table: keep only blocks with at least two rows
                if current_table and len(current_table) >= 2:
                    tables.append(current_table)
                current_table = []

        # Don't forget last table (text may end inside a table block)
        if current_table and len(current_table) >= 2:
            tables.append(current_table)

        return tables

    def _parse_ascii_tables(self, text: str) -> list[list[list[str]]]:
        """Parse ASCII-style tables with borders (e.g. +---+---+ frames).

        The first border line fixes the column boundary positions; data
        lines are then sliced at those positions. Falls back to a plain
        '|' split when no positions were detected.

        NOTE(review): a whitespace-only line matches the second border
        regex below and will start a (likely empty) table region — confirm
        this is acceptable for the inputs this sees.
        """
        tables = []

        # Look for lines with consistent column separators
        lines = text.split('\n')

        # Find potential table regions
        in_table = False
        current_table: list[list[str]] = []
        column_positions: list[int] = []

        for line in lines:
            # Check for border lines
            if re.match(r'^[\+\-\=]+$', line) or re.match(r'^[\|\s\-\+]+$', line):
                if not in_table:
                    in_table = True
                    # Detect column positions from border; only the FIRST
                    # border line sets them for this table region.
                    column_positions = [m.start() for m in re.finditer(r'[\+\|]', line)]
                continue

            if in_table and '|' in line:
                # Extract cells based on column positions
                if column_positions:
                    cells = []
                    for i in range(len(column_positions) - 1):
                        start = column_positions[i] + 1
                        end = column_positions[i + 1]
                        if start < len(line) and end <= len(line):
                            cell = line[start:end].strip()
                            cells.append(cell)
                    if cells:
                        current_table.append(cells)
                else:
                    # Fallback to simple split
                    cells = [c.strip() for c in line.split('|') if c.strip()]
                    if cells:
                        current_table.append(cells)
            elif in_table and current_table:
                # End of table: a non-table line terminates the region
                if len(current_table) >= 2:
                    tables.append(current_table)
                current_table = []
                in_table = False
                column_positions = []

        # Don't forget last table (text may end inside a table region)
        if current_table and len(current_table) >= 2:
            tables.append(current_table)

        return tables

    def _process_raw_table(
        self,
        raw_table: list[list[str]],
        doc_id: str = "",
        page_num: int | None = None,
        node_id: str = "",
    ) -> ParsedTable | None:
        """Process a raw table (list of rows of strings) into a ParsedTable.

        Pads ragged rows with empty cells to the widest row, optionally
        infers per-cell and per-column types, and builds both the row-major
        and column-major views. Returns None for an empty input.
        """
        if not raw_table:
            return None

        # Create table
        table = ParsedTable(
            doc_id=doc_id,
            page_num=page_num,
            node_id=node_id,
        )

        # Determine number of columns (widest row wins; shorter rows padded)
        num_cols = max(len(row) for row in raw_table)
        table.num_cols = num_cols
        table.num_rows = len(raw_table)

        # Process rows
        for row_idx, raw_row in enumerate(raw_table):
            # Only the first row can be a header, and only when enabled
            is_header = self.detect_headers and row_idx == 0

            row = TableRow(index=row_idx, is_header_row=is_header)

            for col_idx in range(num_cols):
                # Pad missing trailing cells with empty strings
                value = raw_row[col_idx] if col_idx < len(raw_row) else ""

                cell = TableCell(
                    row=row_idx,
                    col=col_idx,
                    value=value,
                    is_header=is_header,
                )

                # Infer type
                if self.infer_types:
                    cell.data_type, cell.numeric_value = self._infer_cell_type(value)

                row.cells.append(cell)

            table.rows.append(row)

            # Store headers
            if is_header:
                table.headers = [c.value for c in row.cells]

        # Build columns (column-major view shares the same TableCell objects)
        for col_idx in range(num_cols):
            column = TableColumn(
                index=col_idx,
                header=table.headers[col_idx] if col_idx < len(table.headers) else "",
            )

            for row in table.rows:
                if col_idx < len(row.cells):
                    column.cells.append(row.cells[col_idx])

            # Infer column type from majority
            if self.infer_types:
                column.data_type = self._infer_column_type(column.cells)

            table.columns.append(column)

        return table

    def _infer_cell_type(self, value: str) -> tuple[str, float | None]:
        """Infer the data type of a cell value.

        Returns (type_name, numeric_value); numeric_value is None for
        "text", "date", and "empty". Checks run in priority order:
        currency, percentage, number, date, then text as the fallback.

        NOTE(review): negative currency values (e.g. "-$5") and European
        decimal commas are classified as text — confirm that is intended.
        """
        value = value.strip()

        if not value:
            return "empty", None

        # Currency: symbol followed by digits with optional thousands commas
        if re.match(r'^[\$€£¥]\s*[\d,]+\.?\d*$', value):
            num_str = re.sub(r'[^\d.]', '', value)
            try:
                return "currency", float(num_str)
            except ValueError:
                pass

        # Percentage
        if re.match(r'^[\d.]+\s*%$', value):
            num_str = value.replace('%', '').strip()
            try:
                return "percentage", float(num_str)
            except ValueError:
                pass

        # Number (optional leading minus, thousands commas allowed)
        if re.match(r'^-?[\d,]+\.?\d*$', value):
            num_str = value.replace(',', '')
            try:
                return "number", float(num_str)
            except ValueError:
                pass

        # Date (basic patterns: d/m/y or y/m/d with '/' or '-' separators)
        date_patterns = [
            r'^\d{1,2}[/-]\d{1,2}[/-]\d{2,4}$',
            r'^\d{4}[/-]\d{1,2}[/-]\d{1,2}$',
        ]
        for pattern in date_patterns:
            if re.match(pattern, value):
                return "date", None

        return "text", None

    def _infer_column_type(self, cells: list[TableCell]) -> str:
        """Infer column type as the most common non-empty, non-header cell type.

        Falls back to "text" when the column has no typed cells.
        """
        type_counts: dict[str, int] = {}

        for cell in cells:
            if not cell.is_header and cell.data_type != "empty":
                type_counts[cell.data_type] = type_counts.get(cell.data_type, 0) + 1

        if type_counts:
            return max(type_counts.items(), key=lambda x: x[1])[0]
        return "text"

    def parse_from_html(
        self,
        html: str,
        doc_id: str = "",
        page_num: int | None = None,
    ) -> list[ParsedTable]:
        """Parse all <table> elements from an HTML string.

        Returns an empty list (with a warning logged) when beautifulsoup4
        is not installed.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            logger.warning("beautifulsoup4_not_available")
            return []

        soup = BeautifulSoup(html, 'html.parser')
        tables = []

        for table_elem in soup.find_all('table'):
            parsed = self._parse_html_table(table_elem, doc_id, page_num)
            if parsed:
                tables.append(parsed)

        return tables

    def _parse_html_table(
        self,
        table_elem: Any,
        doc_id: str,
        page_num: int | None,
    ) -> ParsedTable | None:
        """Parse a single HTML <table> element (a bs4 Tag) into a ParsedTable.

        Collects the text of every <td>/<th> per <tr>; rowspan/colspan
        attributes are not expanded here.
        """
        raw_table: list[list[str]] = []

        # Get rows
        rows = table_elem.find_all('tr')

        for row in rows:
            cells = row.find_all(['td', 'th'])
            raw_row = [cell.get_text(strip=True) for cell in cells]
            if raw_row:
                raw_table.append(raw_row)

        if raw_table:
            return self._process_raw_table(
                raw_table,
                doc_id=doc_id,
                page_num=page_num,
            )

        return None
|
|
572
|
+
|
|
573
|
+
|
|
574
|
+
# =============================================================================
|
|
575
|
+
# Table Query Engine
|
|
576
|
+
# =============================================================================
|
|
577
|
+
|
|
578
|
+
|
|
579
|
+
class TableQueryEngine:
    """
    Enables SQL-like queries over parsed tables.

    Supports:
    - SELECT: Get specific columns
    - WHERE: Filter rows
    - ORDER BY: Sort results (lexicographic on cell text)
    - Aggregations: SUM, AVG, COUNT, MAX, MIN

    All column-name matching is case-insensitive against the table headers.
    """

    def __init__(self, table: ParsedTable):
        """
        Initialize query engine for a table.

        Args:
            table: The parsed table to query.
        """
        self.table = table

    def select(
        self,
        columns: list[str] | None = None,
        where: dict[str, Any] | None = None,
        order_by: str | None = None,
        ascending: bool = True,
        limit: int | None = None,
    ) -> list[dict[str, str]]:
        """
        Select data from the table.

        Args:
            columns: Column names to select (None = all); matched
                case-insensitively.
            where: Filter conditions {column: value} or {column: (op, value)}
                with op in "eq", "ne", "contains", "gt", "lt", "gte", "lte".
            order_by: Column to sort by (case-insensitive header match).
                Sorting compares cell text lexicographically.
            ascending: Sort direction.
            limit: Maximum rows to return; 0 returns no rows.

        Returns:
            List of row dictionaries keyed by the original header text.
        """
        results = []

        # Map lowercased header -> column index for case-insensitive lookup
        col_indices = {}
        for i, header in enumerate(self.table.headers):
            col_indices[header.lower()] = i

        # Process data rows
        for row in self.table.rows:
            if row.is_header_row:
                continue

            # Apply WHERE filter
            if where and not self._matches_where(row, where, col_indices):
                continue

            # Build result row keyed by the original header text
            result_row = {}
            for i, cell in enumerate(row.cells):
                header = self.table.headers[i] if i < len(self.table.headers) else f"col_{i}"

                # Filter columns if specified
                if columns is None or header.lower() in [c.lower() for c in columns]:
                    result_row[header] = cell.value

            results.append(result_row)

        # Apply ORDER BY. Resolve the caller's (possibly differently cased)
        # column name to the exact header text used as the result-row key;
        # previously any case mismatch silently disabled sorting.
        if order_by:
            order_idx = col_indices.get(order_by.lower())
            if order_idx is not None:
                sort_key = self.table.headers[order_idx]
                results.sort(
                    key=lambda x: x.get(sort_key, ""),
                    reverse=not ascending,
                )

        # Apply LIMIT ("is not None" so that limit=0 yields an empty result)
        if limit is not None:
            results = results[:limit]

        return results

    def _matches_where(
        self,
        row: TableRow,
        where: dict[str, Any],
        col_indices: dict[str, int],
    ) -> bool:
        """Check if a row matches all WHERE conditions (AND semantics).

        Unknown column names are skipped (treated as matching). String
        comparisons are case-insensitive; gt/lt/gte/lte strip commas and
        '$' before numeric comparison and fail the match on parse errors.
        """
        for col_name, condition in where.items():
            col_idx = col_indices.get(col_name.lower())
            if col_idx is None or col_idx >= len(row.cells):
                continue

            cell_value = row.cells[col_idx].value.lower()

            if isinstance(condition, tuple):
                op, value = condition
                value = str(value).lower()

                if op == "eq" and cell_value != value:
                    return False
                elif op == "ne" and cell_value == value:
                    return False
                elif op == "contains" and value not in cell_value:
                    return False
                elif op == "gt" or op == "lt" or op == "gte" or op == "lte":
                    try:
                        cell_num = float(cell_value.replace(',', '').replace('$', ''))
                        val_num = float(value.replace(',', '').replace('$', ''))

                        if op == "gt" and cell_num <= val_num:
                            return False
                        elif op == "lt" and cell_num >= val_num:
                            return False
                        elif op == "gte" and cell_num < val_num:
                            return False
                        elif op == "lte" and cell_num > val_num:
                            return False
                    except ValueError:
                        return False
            else:
                # Simple equality
                if cell_value != str(condition).lower():
                    return False

        return True

    def aggregate(
        self,
        column: str,
        operation: Literal["sum", "avg", "count", "max", "min"],
        where: dict[str, Any] | None = None,
    ) -> float | int:
        """
        Perform aggregation on a column.

        Args:
            column: Column to aggregate (case-insensitive header match).
            operation: Aggregation operation.
            where: Optional filter applied before aggregating.

        Returns:
            Aggregated value; 0 when the column is unknown or holds no
            numeric values.
        """
        col_indices = {h.lower(): i for i, h in enumerate(self.table.headers)}
        col_idx = col_indices.get(column.lower())

        if col_idx is None:
            return 0

        values = []

        for row in self.table.rows:
            if row.is_header_row:
                continue

            if where and not self._matches_where(row, where, col_indices):
                continue

            if col_idx < len(row.cells):
                cell = row.cells[col_idx]
                # Prefer the pre-parsed numeric value; otherwise re-parse
                # numeric-looking cell text, skipping unparseable cells.
                if cell.numeric_value is not None:
                    values.append(cell.numeric_value)
                elif cell.data_type in ["number", "currency", "percentage"]:
                    try:
                        num = float(cell.value.replace(',', '').replace('$', '').replace('%', ''))
                        values.append(num)
                    except ValueError:
                        pass

        if not values:
            return 0

        if operation == "sum":
            return sum(values)
        elif operation == "avg":
            return sum(values) / len(values)
        elif operation == "count":
            return len(values)
        elif operation == "max":
            return max(values)
        elif operation == "min":
            return min(values)

        return 0

    def get_cell_value(self, row: int, column: str | int) -> str | None:
        """Get a specific cell value by row index and column index or header.

        Returns None when the column header is unknown or the position is
        out of bounds.
        """
        if isinstance(column, str):
            col = self.table.get_column_by_header(column)
            if col:
                column = col.index
            else:
                return None

        cell = self.table.get_cell(row, column)
        return cell.value if cell else None
|
|
776
|
+
|
|
777
|
+
|
|
778
|
+
# =============================================================================
|
|
779
|
+
# Convenience Functions
|
|
780
|
+
# =============================================================================
|
|
781
|
+
|
|
782
|
+
|
|
783
|
+
def parse_tables_from_text(
    text: str,
    doc_id: str = "",
) -> list[ParsedTable]:
    """Parse every table found in *text* using a default-configured TableParser."""
    return TableParser().parse_from_text(text, doc_id=doc_id)
|
|
790
|
+
|
|
791
|
+
|
|
792
|
+
def query_table(
    table: ParsedTable,
    columns: list[str] | None = None,
    where: dict[str, Any] | None = None,
) -> list[dict[str, str]]:
    """Run a simple SELECT-style query on *table* via a TableQueryEngine."""
    return TableQueryEngine(table).select(columns=columns, where=where)
|