rnsr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,799 @@
1
+ """
2
+ RNSR Table Parser
3
+
4
+ Deep parsing of tables from documents.
5
+ Extracts structure, headers, cells, and enables cell-level retrieval.
6
+
7
+ Features:
8
+ - Table structure extraction (rows, columns, headers)
9
+ - Cell-level retrieval (answer questions about specific cells)
10
+ - SQL-like queries over extracted table data
11
+ - Support for merged cells, multi-level headers
12
+
13
+ Integrates with the ingestion pipeline to extract tables from PDFs.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import json
19
+ import re
20
+ from dataclasses import dataclass, field
21
+ from typing import Any, Literal
22
+ from uuid import uuid4
23
+
24
+ import structlog
25
+
26
+ logger = structlog.get_logger(__name__)
27
+
28
+
29
+ # =============================================================================
30
+ # Data Models
31
+ # =============================================================================
32
+
33
+
34
@dataclass
class TableCell:
    """One cell of a parsed table, addressed by (row, col)."""

    row: int
    col: int
    value: str

    # Layout / merge metadata
    is_header: bool = False
    is_merged: bool = False
    rowspan: int = 1
    colspan: int = 1

    # Inferred typing of the raw string value; numeric_value is the
    # parsed number for number/currency/percentage cells.
    data_type: Literal["text", "number", "currency", "percentage", "date", "empty"] = "text"
    numeric_value: float | None = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize this cell to a plain dictionary."""
        return dict(
            row=self.row,
            col=self.col,
            value=self.value,
            is_header=self.is_header,
            is_merged=self.is_merged,
            rowspan=self.rowspan,
            colspan=self.colspan,
            data_type=self.data_type,
            numeric_value=self.numeric_value,
        )
65
+
66
+
67
@dataclass
class TableRow:
    """An ordered collection of cells forming one table row."""

    index: int
    cells: list[TableCell] = field(default_factory=list)
    is_header_row: bool = False

    def to_dict(self) -> dict[str, Any]:
        """Serialize the row, including each contained cell."""
        serialized_cells = [cell.to_dict() for cell in self.cells]
        return {
            "index": self.index,
            "cells": serialized_cells,
            "is_header_row": self.is_header_row,
        }

    def get_values(self) -> list[str]:
        """Return the raw string value of every cell, left to right."""
        return [cell.value for cell in self.cells]
86
+
87
+
88
@dataclass
class TableColumn:
    """A vertical slice of a table: a header label plus its cells."""

    index: int
    header: str = ""
    data_type: str = "text"
    cells: list[TableCell] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Serialize column metadata (cells are summarized by count only)."""
        return dict(
            index=self.index,
            header=self.header,
            data_type=self.data_type,
            cell_count=len(self.cells),
        )

    def get_values(self) -> list[str]:
        """Return every cell's raw string value, top to bottom."""
        return [cell.value for cell in self.cells]
109
+
110
+
111
@dataclass
class ParsedTable:
    """A fully parsed table.

    Holds the row/column structure, detected headers, and source
    provenance, and offers cell access plus markdown/CSV export.
    """

    id: str = field(default_factory=lambda: f"table_{str(uuid4())[:8]}")

    # Source information
    doc_id: str = ""
    page_num: int | None = None
    node_id: str = ""

    # Table metadata
    title: str = ""
    caption: str = ""

    # Structure
    rows: list[TableRow] = field(default_factory=list)
    columns: list[TableColumn] = field(default_factory=list)

    # Dimensions
    num_rows: int = 0
    num_cols: int = 0
    header_rows: int = 1

    # Content: header labels, taken from the detected header row
    headers: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary (rows and columns serialized recursively)."""
        return {
            "id": self.id,
            "doc_id": self.doc_id,
            "page_num": self.page_num,
            "node_id": self.node_id,
            "title": self.title,
            "caption": self.caption,
            "num_rows": self.num_rows,
            "num_cols": self.num_cols,
            "header_rows": self.header_rows,
            "headers": self.headers,
            "rows": [r.to_dict() for r in self.rows],
            "columns": [c.to_dict() for c in self.columns],
        }

    def get_cell(self, row: int, col: int) -> TableCell | None:
        """Return the cell at (row, col), or None when out of bounds."""
        if 0 <= row < len(self.rows):
            row_obj = self.rows[row]
            if 0 <= col < len(row_obj.cells):
                return row_obj.cells[col]
        return None

    def get_row(self, index: int) -> TableRow | None:
        """Return the row at *index*, or None when out of bounds."""
        if 0 <= index < len(self.rows):
            return self.rows[index]
        return None

    def get_column(self, index: int) -> TableColumn | None:
        """Return the column at *index*, or None when out of bounds."""
        if 0 <= index < len(self.columns):
            return self.columns[index]
        return None

    def get_column_by_header(self, header: str) -> TableColumn | None:
        """Case-insensitive lookup of a column by its header label."""
        header_lower = header.lower()
        for col in self.columns:
            if col.header.lower() == header_lower:
                return col
        return None

    def to_markdown(self) -> str:
        """Convert table to markdown format.

        Emits an optional bold title, a header + separator line (when
        headers were detected), the non-header data rows, and an
        optional italic caption. Cell text is not escaped, so a literal
        '|' in a value will break the markdown layout.
        """
        if not self.rows:
            return ""

        lines = []

        # Title
        if self.title:
            lines.append(f"**{self.title}**\n")

        # Headers
        if self.headers:
            lines.append("| " + " | ".join(self.headers) + " |")
            lines.append("|" + "|".join(["---"] * len(self.headers)) + "|")

        # Data rows (header rows are already rendered above)
        for row in self.rows:
            if not row.is_header_row:
                values = [c.value for c in row.cells]
                lines.append("| " + " | ".join(values) + " |")

        # Caption
        if self.caption:
            lines.append(f"\n*{self.caption}*")

        return "\n".join(lines)

    def to_csv(self) -> str:
        """Convert table to CSV format (all rows, including header rows).

        Uses the stdlib csv writer so commas, quotes, and embedded
        newlines are all escaped correctly; the previous hand-rolled
        escaping failed to quote cells containing newlines.
        """
        import csv
        import io

        buffer = io.StringIO()
        writer = csv.writer(buffer, lineterminator="\n")
        for row in self.rows:
            writer.writerow([cell.value for cell in row.cells])
        # Match the old output exactly: rows joined by '\n' with no
        # trailing newline (removesuffix strips exactly one terminator).
        return buffer.getvalue().removesuffix("\n")
226
+
227
+
228
+ # =============================================================================
229
+ # Table Parser
230
+ # =============================================================================
231
+
232
+
233
class TableParser:
    """
    Parses tables from various formats.

    Supports:
    - Plain text tables (markdown, ASCII)
    - HTML tables
    - PDF table extraction (via PyMuPDF)

    NOTE(review): no PDF-specific entry point is visible in this class;
    PDF tables presumably arrive as text extracted upstream — confirm.
    """

    def __init__(
        self,
        infer_types: bool = True,
        detect_headers: bool = True,
    ):
        """
        Initialize the table parser.

        Args:
            infer_types: Whether to infer cell data types.
            detect_headers: Whether to detect header rows (the first row
                of each raw table is treated as the header).
        """
        self.infer_types = infer_types
        self.detect_headers = detect_headers

    def parse_from_text(
        self,
        text: str,
        doc_id: str = "",
        page_num: int | None = None,
        node_id: str = "",
    ) -> list[ParsedTable]:
        """
        Parse tables from plain text.

        Markdown-style (pipe-delimited) parsing is tried first; ASCII
        bordered-table parsing is used only as a fallback when no
        markdown tables were found.

        Args:
            text: Text containing table(s).
            doc_id: Document ID.
            page_num: Page number.
            node_id: Node ID.

        Returns:
            List of ParsedTable objects.
        """
        tables = []

        # Try markdown table parsing
        markdown_tables = self._parse_markdown_tables(text)

        for raw_table in markdown_tables:
            parsed = self._process_raw_table(
                raw_table,
                doc_id=doc_id,
                page_num=page_num,
                node_id=node_id,
            )
            if parsed:
                tables.append(parsed)

        # Try ASCII table parsing if no markdown found
        if not tables:
            ascii_tables = self._parse_ascii_tables(text)
            for raw_table in ascii_tables:
                parsed = self._process_raw_table(
                    raw_table,
                    doc_id=doc_id,
                    page_num=page_num,
                    node_id=node_id,
                )
                if parsed:
                    tables.append(parsed)

        logger.info("tables_parsed", count=len(tables))

        return tables

    def _parse_markdown_tables(self, text: str) -> list[list[list[str]]]:
        """Parse markdown-style tables.

        Returns a list of raw tables; each raw table is a list of rows,
        each row a list of cell strings. A candidate block must contain
        at least two pipe-delimited lines to be kept as a table.
        """
        tables = []

        # Find table blocks (consecutive lines with |)
        lines = text.split('\n')
        current_table: list[list[str]] = []

        for line in lines:
            line = line.strip()

            if '|' in line:
                # Skip separator lines such as |---|:---:|
                if re.match(r'^[\|\s\-:]+$', line):
                    continue

                # Parse cells
                cells = [c.strip() for c in line.split('|')]
                # Remove empty first/last if line starts/ends with |
                if cells and cells[0] == '':
                    cells = cells[1:]
                if cells and cells[-1] == '':
                    cells = cells[:-1]

                if cells:
                    current_table.append(cells)
            else:
                # A non-pipe line ends the current block; keep it only
                # if it has at least two rows.
                if current_table and len(current_table) >= 2:
                    tables.append(current_table)
                current_table = []

        # Don't forget last table
        if current_table and len(current_table) >= 2:
            tables.append(current_table)

        return tables

    def _parse_ascii_tables(self, text: str) -> list[list[list[str]]]:
        """Parse ASCII-style tables with borders.

        State machine: a border line (e.g. "+----+----+" or "=====")
        opens a table and fixes the column boundary positions from its
        '+'/'|' marks; subsequent lines containing '|' are sliced at
        those positions; any other line closes the table.
        """
        tables = []

        # Look for lines with consistent column separators
        lines = text.split('\n')

        # Find potential table regions
        in_table = False
        current_table: list[list[str]] = []
        column_positions: list[int] = []

        for line in lines:
            # Check for border lines (note: not stripped, so trailing
            # whitespace makes a border line fail this match)
            if re.match(r'^[\+\-\=]+$', line) or re.match(r'^[\|\s\-\+]+$', line):
                if not in_table:
                    in_table = True
                    # Detect column positions from the first border only
                    column_positions = [m.start() for m in re.finditer(r'[\+\|]', line)]
                continue

            if in_table and '|' in line:
                # Extract cells based on column positions
                if column_positions:
                    cells = []
                    for i in range(len(column_positions) - 1):
                        start = column_positions[i] + 1
                        end = column_positions[i + 1]
                        if start < len(line) and end <= len(line):
                            cell = line[start:end].strip()
                            cells.append(cell)
                    if cells:
                        current_table.append(cells)
                else:
                    # Fallback to simple split when the border carried
                    # no '+'/'|' markers (e.g. a "=====" rule)
                    cells = [c.strip() for c in line.split('|') if c.strip()]
                    if cells:
                        current_table.append(cells)
            elif in_table and current_table:
                # End of table
                if len(current_table) >= 2:
                    tables.append(current_table)
                current_table = []
                in_table = False
                column_positions = []

        # Don't forget last table
        if current_table and len(current_table) >= 2:
            tables.append(current_table)

        return tables

    def _process_raw_table(
        self,
        raw_table: list[list[str]],
        doc_id: str = "",
        page_num: int | None = None,
        node_id: str = "",
    ) -> ParsedTable | None:
        """Process a raw table (list of rows of cell strings) into a ParsedTable.

        Short rows are padded with empty cells up to the widest row.
        When header detection is enabled, only the first row is treated
        as the header (multi-row headers are not detected here).
        """
        if not raw_table:
            return None

        # Create table
        table = ParsedTable(
            doc_id=doc_id,
            page_num=page_num,
            node_id=node_id,
        )

        # Determine number of columns (widest row wins)
        num_cols = max(len(row) for row in raw_table)
        table.num_cols = num_cols
        table.num_rows = len(raw_table)

        # Process rows
        for row_idx, raw_row in enumerate(raw_table):
            is_header = self.detect_headers and row_idx == 0

            row = TableRow(index=row_idx, is_header_row=is_header)

            for col_idx in range(num_cols):
                # Pad missing trailing cells with empty strings
                value = raw_row[col_idx] if col_idx < len(raw_row) else ""

                cell = TableCell(
                    row=row_idx,
                    col=col_idx,
                    value=value,
                    is_header=is_header,
                )

                # Infer type
                if self.infer_types:
                    cell.data_type, cell.numeric_value = self._infer_cell_type(value)

                row.cells.append(cell)

            table.rows.append(row)

            # Store headers
            if is_header:
                table.headers = [c.value for c in row.cells]

        # Build columns (each column shares the row's cell objects)
        for col_idx in range(num_cols):
            column = TableColumn(
                index=col_idx,
                header=table.headers[col_idx] if col_idx < len(table.headers) else "",
            )

            for row in table.rows:
                if col_idx < len(row.cells):
                    column.cells.append(row.cells[col_idx])

            # Infer column type from majority
            if self.infer_types:
                column.data_type = self._infer_column_type(column.cells)

            table.columns.append(column)

        return table

    def _infer_cell_type(self, value: str) -> tuple[str, float | None]:
        """Infer the data type of a cell value.

        Returns:
            (data_type, numeric_value) — numeric_value is the parsed
            float for currency/percentage/number cells, else None.
        """
        value = value.strip()

        if not value:
            return "empty", None

        # Currency (note: negative amounts like "-$5" or "($5)" are not
        # matched by this pattern and fall through to "text")
        if re.match(r'^[\$€£¥]\s*[\d,]+\.?\d*$', value):
            num_str = re.sub(r'[^\d.]', '', value)
            try:
                return "currency", float(num_str)
            except ValueError:
                pass

        # Percentage
        if re.match(r'^[\d.]+\s*%$', value):
            num_str = value.replace('%', '').strip()
            try:
                return "percentage", float(num_str)
            except ValueError:
                pass

        # Number (optional leading minus, thousands separators allowed)
        if re.match(r'^-?[\d,]+\.?\d*$', value):
            num_str = value.replace(',', '')
            try:
                return "number", float(num_str)
            except ValueError:
                pass

        # Date (basic numeric patterns only, e.g. 12/31/2024 or 2024-12-31)
        date_patterns = [
            r'^\d{1,2}[/-]\d{1,2}[/-]\d{2,4}$',
            r'^\d{4}[/-]\d{1,2}[/-]\d{1,2}$',
        ]
        for pattern in date_patterns:
            if re.match(pattern, value):
                return "date", None

        return "text", None

    def _infer_column_type(self, cells: list[TableCell]) -> str:
        """Infer column type as the most common non-empty, non-header cell type."""
        type_counts: dict[str, int] = {}

        for cell in cells:
            if not cell.is_header and cell.data_type != "empty":
                type_counts[cell.data_type] = type_counts.get(cell.data_type, 0) + 1

        if type_counts:
            return max(type_counts.items(), key=lambda x: x[1])[0]
        return "text"

    def parse_from_html(
        self,
        html: str,
        doc_id: str = "",
        page_num: int | None = None,
    ) -> list[ParsedTable]:
        """Parse tables from HTML (requires beautifulsoup4; returns [] if absent)."""
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            logger.warning("beautifulsoup4_not_available")
            return []

        soup = BeautifulSoup(html, 'html.parser')
        tables = []

        for table_elem in soup.find_all('table'):
            parsed = self._parse_html_table(table_elem, doc_id, page_num)
            if parsed:
                tables.append(parsed)

        return tables

    def _parse_html_table(
        self,
        table_elem: Any,
        doc_id: str,
        page_num: int | None,
    ) -> ParsedTable | None:
        """Parse a single HTML <table> element (a bs4 Tag) into a ParsedTable.

        rowspan/colspan attributes are not expanded — each <td>/<th>
        becomes exactly one cell.
        """
        raw_table: list[list[str]] = []

        # Get rows
        rows = table_elem.find_all('tr')

        for row in rows:
            cells = row.find_all(['td', 'th'])
            raw_row = [cell.get_text(strip=True) for cell in cells]
            if raw_row:
                raw_table.append(raw_row)

        if raw_table:
            return self._process_raw_table(
                raw_table,
                doc_id=doc_id,
                page_num=page_num,
            )

        return None
572
+
573
+
574
+ # =============================================================================
575
+ # Table Query Engine
576
+ # =============================================================================
577
+
578
+
579
class TableQueryEngine:
    """
    Enables SQL-like queries over parsed tables.

    Supports:
    - SELECT: Get specific columns
    - WHERE: Filter rows
    - ORDER BY: Sort results
    - Aggregations: SUM, AVG, COUNT, MAX, MIN
    """

    def __init__(self, table: ParsedTable):
        """
        Initialize query engine for a table.

        Args:
            table: The parsed table to query.
        """
        self.table = table

    def select(
        self,
        columns: list[str] | None = None,
        where: dict[str, Any] | None = None,
        order_by: str | None = None,
        ascending: bool = True,
        limit: int | None = None,
    ) -> list[dict[str, str]]:
        """
        Select data from the table.

        Args:
            columns: Column names to select (None = all); matched
                case-insensitively against the table headers.
            where: Filter conditions {column: value} for equality, or
                {column: (op, value)} with op in
                {"eq", "ne", "contains", "gt", "lt", "gte", "lte"}.
            order_by: Column to sort by (case-insensitive); sorting is
                lexicographic on the cell's string value.
            ascending: Sort direction.
            limit: Maximum rows to return (0 returns no rows).

        Returns:
            List of row dictionaries keyed by the original header text.
        """
        results = []

        # Map lower-cased header -> column index for case-insensitive lookup
        col_indices = {}
        for i, header in enumerate(self.table.headers):
            col_indices[header.lower()] = i

        # Process data rows
        for row in self.table.rows:
            if row.is_header_row:
                continue

            # Apply WHERE filter
            if where and not self._matches_where(row, where, col_indices):
                continue

            # Build result row, keyed by the original (cased) header text
            result_row = {}
            for i, cell in enumerate(row.cells):
                header = self.table.headers[i] if i < len(self.table.headers) else f"col_{i}"

                # Filter columns if specified
                if columns is None or header.lower() in [c.lower() for c in columns]:
                    result_row[header] = cell.value

            results.append(result_row)

        # Apply ORDER BY. Result rows are keyed by the original header
        # casing, so resolve the caller's (possibly differently-cased)
        # name back to the exact header; previously a case mismatch made
        # every sort key fall back to "" and the sort was a silent no-op.
        if order_by and order_by.lower() in col_indices:
            order_key = self.table.headers[col_indices[order_by.lower()]]
            results.sort(
                key=lambda x: x.get(order_key, ""),
                reverse=not ascending,
            )

        # Apply LIMIT. Compare against None explicitly so that limit=0
        # means "no rows" rather than "no limit".
        if limit is not None:
            results = results[:limit]

        return results

    def _matches_where(
        self,
        row: TableRow,
        where: dict[str, Any],
        col_indices: dict[str, int],
    ) -> bool:
        """Return True if *row* satisfies every condition in *where*.

        Conditions on unknown or out-of-range columns are skipped
        (treated as satisfied). Comparisons are case-insensitive;
        numeric ops strip commas and dollar signs before converting.
        """
        for col_name, condition in where.items():
            col_idx = col_indices.get(col_name.lower())
            if col_idx is None or col_idx >= len(row.cells):
                continue  # unknown column: condition is ignored, not failed

            cell_value = row.cells[col_idx].value.lower()

            if isinstance(condition, tuple):
                op, value = condition
                value = str(value).lower()

                if op == "eq" and cell_value != value:
                    return False
                elif op == "ne" and cell_value == value:
                    return False
                elif op == "contains" and value not in cell_value:
                    return False
                elif op == "gt" or op == "lt" or op == "gte" or op == "lte":
                    try:
                        cell_num = float(cell_value.replace(',', '').replace('$', ''))
                        val_num = float(value.replace(',', '').replace('$', ''))

                        if op == "gt" and cell_num <= val_num:
                            return False
                        elif op == "lt" and cell_num >= val_num:
                            return False
                        elif op == "gte" and cell_num < val_num:
                            return False
                        elif op == "lte" and cell_num > val_num:
                            return False
                    except ValueError:
                        # A non-numeric cell cannot satisfy a numeric comparison
                        return False
            else:
                # Simple equality
                if cell_value != str(condition).lower():
                    return False

        return True

    def aggregate(
        self,
        column: str,
        operation: Literal["sum", "avg", "count", "max", "min"],
        where: dict[str, Any] | None = None,
    ) -> float | int:
        """
        Perform aggregation over the numeric cells of a column.

        Args:
            column: Column to aggregate (case-insensitive header match).
            operation: Aggregation operation.
            where: Optional filter (same format as select()).

        Returns:
            Aggregated value; 0 when the column is unknown or holds no
            numeric cells. Note "count" counts numeric cells only.
        """
        col_indices = {h.lower(): i for i, h in enumerate(self.table.headers)}
        col_idx = col_indices.get(column.lower())

        if col_idx is None:
            return 0

        values = []

        for row in self.table.rows:
            if row.is_header_row:
                continue

            if where and not self._matches_where(row, where, col_indices):
                continue

            if col_idx < len(row.cells):
                cell = row.cells[col_idx]
                if cell.numeric_value is not None:
                    values.append(cell.numeric_value)
                elif cell.data_type in ["number", "currency", "percentage"]:
                    # Fall back to parsing the raw text when type inference
                    # tagged the cell numeric but stored no numeric_value
                    try:
                        num = float(cell.value.replace(',', '').replace('$', '').replace('%', ''))
                        values.append(num)
                    except ValueError:
                        pass

        if not values:
            return 0

        if operation == "sum":
            return sum(values)
        elif operation == "avg":
            return sum(values) / len(values)
        elif operation == "count":
            return len(values)
        elif operation == "max":
            return max(values)
        elif operation == "min":
            return min(values)

        return 0

    def get_cell_value(self, row: int, column: str | int) -> str | None:
        """Get a cell value by row index and column index or header name."""
        if isinstance(column, str):
            col = self.table.get_column_by_header(column)
            if col:
                column = col.index
            else:
                return None

        cell = self.table.get_cell(row, column)
        return cell.value if cell else None
776
+
777
+
778
+ # =============================================================================
779
+ # Convenience Functions
780
+ # =============================================================================
781
+
782
+
783
def parse_tables_from_text(
    text: str,
    doc_id: str = "",
) -> list[ParsedTable]:
    """Parse every table found in *text* with a default-configured parser."""
    return TableParser().parse_from_text(text, doc_id=doc_id)
790
+
791
+
792
def query_table(
    table: ParsedTable,
    columns: list[str] | None = None,
    where: dict[str, Any] | None = None,
) -> list[dict[str, str]]:
    """Convenience wrapper: run a one-off SELECT against *table*."""
    engine = TableQueryEngine(table)
    return engine.select(columns=columns, where=where)