rnsr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,569 @@
1
+ """
2
+ RNSR Chart Parser
3
+
4
+ Extracts and interprets charts from documents.
5
+ Enables answering questions about visual data representations.
6
+
7
+ Features:
8
+ - Chart type detection (bar, line, pie, scatter)
9
+ - Data point extraction
10
+ - Trend analysis
11
+ - LLM-based chart interpretation
12
+
13
+ Note: Requires vision capabilities for actual chart parsing.
14
+ This module provides the framework and LLM interpretation layer.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import json
20
+ import re
21
+ from dataclasses import dataclass, field
22
+ from enum import Enum
23
+ from typing import Any, Callable
24
+ from uuid import uuid4
25
+
26
+ import structlog
27
+
28
+ logger = structlog.get_logger(__name__)
29
+
30
+
31
+ # =============================================================================
32
+ # Data Models
33
+ # =============================================================================
34
+
35
+
36
class ChartType(str, Enum):
    """Chart categories a parsed figure can be labelled with.

    The enum mixes in ``str``, so each member compares equal to its
    lowercase value and can be looked up from it, e.g. ``ChartType("bar")``.
    """

    # Basic chart families.
    BAR = "bar"
    LINE = "line"
    PIE = "pie"
    SCATTER = "scatter"
    AREA = "area"
    HISTOGRAM = "histogram"

    # Composite layouts.
    STACKED_BAR = "stacked_bar"
    COMBINATION = "combination"

    # Fallback when the chart could not be classified.
    UNKNOWN = "unknown"
48
+
49
+
50
@dataclass
class DataPoint:
    """One plotted value: an X position paired with its Y reading."""

    # X-axis value or category label.
    x: str | float
    # Y-axis value.
    y: float
    # Name of the series the point belongs to (for multi-series charts).
    series: str = ""
    # Label displayed for the point.
    label: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the point as a plain dictionary keyed by field name."""
        exported_fields = ("x", "y", "series", "label")
        return {name: getattr(self, name) for name in exported_fields}
67
+
68
+
69
@dataclass
class ChartSeries:
    """A named, ordered group of data points drawn as one series."""

    name: str = ""
    data_points: list[DataPoint] = field(default_factory=list)
    color: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the series, with every point serialized, as a dictionary."""
        serialized_points = []
        for point in self.data_points:
            serialized_points.append(point.to_dict())

        return {
            "name": self.name,
            "data_points": serialized_points,
            "color": self.color,
        }

    def get_values(self) -> list[float]:
        """Return the Y value of each point, in stored order."""
        y_values = []
        for point in self.data_points:
            y_values.append(point.y)
        return y_values

    def get_labels(self) -> list[str]:
        """Return the X value of each point converted to ``str``."""
        x_labels = []
        for point in self.data_points:
            x_labels.append(str(point.x))
        return x_labels
92
+
93
+
94
@dataclass
class ChartAnalysis:
    """Trend label, summary statistics, and insights derived for a chart."""

    # Expected labels: "increasing", "decreasing", "stable", "fluctuating".
    trend: str = ""
    trend_description: str = ""
    # Statistics stay None until they have been computed.
    min_value: float | None = None
    max_value: float | None = None
    avg_value: float | None = None
    key_insights: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return every field of the analysis as a plain dictionary."""
        exported_fields = (
            "trend",
            "trend_description",
            "min_value",
            "max_value",
            "avg_value",
            "key_insights",
        )
        return {name: getattr(self, name) for name in exported_fields}
115
+
116
+
117
@dataclass
class ParsedChart:
    """A chart extracted from a document: source, metadata, data, analysis.

    Every field has a default, so a chart can be created empty and filled in
    step by step. ``id`` defaults to ``"chart_"`` followed by the first eight
    characters of a random UUID4.
    """

    id: str = field(default_factory=lambda: f"chart_{str(uuid4())[:8]}")

    # Source information
    doc_id: str = ""
    page_num: int | None = None
    node_id: str = ""

    # Chart metadata
    title: str = ""
    chart_type: ChartType = ChartType.UNKNOWN

    # Axis information
    x_axis_label: str = ""
    y_axis_label: str = ""
    x_axis_unit: str = ""
    y_axis_unit: str = ""

    # Data
    series: list[ChartSeries] = field(default_factory=list)

    # Analysis
    analysis: ChartAnalysis | None = None

    # LLM interpretation
    description: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Convert the chart, its series and its analysis to a dictionary.

        ``chart_type`` is exported as its string value and ``analysis`` as
        ``None`` when no analysis is attached.
        """
        return {
            "id": self.id,
            "doc_id": self.doc_id,
            "page_num": self.page_num,
            "node_id": self.node_id,
            "title": self.title,
            "chart_type": self.chart_type.value,
            "x_axis_label": self.x_axis_label,
            "y_axis_label": self.y_axis_label,
            "x_axis_unit": self.x_axis_unit,
            "y_axis_unit": self.y_axis_unit,
            "series": [s.to_dict() for s in self.series],
            "analysis": self.analysis.to_dict() if self.analysis else None,
            "description": self.description,
        }

    def get_all_values(self) -> list[float]:
        """Return the Y values of all series, concatenated in series order."""
        values = []
        for series in self.series:
            values.extend(series.get_values())
        return values

    def summarize(self) -> str:
        """Generate a newline-separated text summary of the chart.

        The summary always contains the chart type. Title, series/point
        counts, trend and value range are added only when available.
        """
        parts = []

        if self.title:
            parts.append(f"Chart: {self.title}")

        parts.append(f"Type: {self.chart_type.value}")

        if self.series:
            parts.append(f"Series: {len(self.series)}")
            total_points = sum(len(s.data_points) for s in self.series)
            parts.append(f"Data points: {total_points}")

        if self.analysis:
            if self.analysis.trend:
                parts.append(f"Trend: {self.analysis.trend}")

            # Both bounds are required: formatting None with ":.2f" raises
            # TypeError, so a half-populated analysis must not reach the
            # f-string below.
            low = self.analysis.min_value
            high = self.analysis.max_value
            if low is not None and high is not None:
                parts.append(f"Range: {low:.2f} - {high:.2f}")

        return "\n".join(parts)
193
+
194
+
195
+ # =============================================================================
196
+ # Chart Interpretation Prompts
197
+ # =============================================================================
198
+
199
# Instruction text sent together with a chart image to the vision model.
# It asks for a single JSON object describing the chart type, title, axes,
# series data, trend and key insights.
# NOTE(review): the JSON example writes its braces doubled ("{{" / "}}"),
# which is the str.format() escape for literal braces. Confirm that callers
# format the template before sending it; otherwise the model receives the
# doubled braces verbatim.
CHART_INTERPRETATION_PROMPT = """Analyze this chart image and extract information.

Describe:
1. CHART TYPE: What type of chart is this? (bar, line, pie, scatter, etc.)
2. TITLE: What is the chart title?
3. AXES: What are the X and Y axis labels and units?
4. DATA: List the data points you can identify (approximate values are fine)
5. TREND: Is there a trend (increasing, decreasing, stable)?
6. KEY INSIGHTS: What are the main takeaways from this chart?

Respond in JSON:
{{
"chart_type": "bar|line|pie|scatter|area|histogram|unknown",
"title": "...",
"x_axis": {{"label": "...", "unit": "..."}},
"y_axis": {{"label": "...", "unit": "..."}},
"series": [
{{
"name": "Series name",
"data": [
{{"x": "label or value", "y": numeric_value}}
]
}}
],
"trend": "increasing|decreasing|stable|fluctuating",
"insights": ["insight 1", "insight 2"]
}}"""
226
+
227
+
228
# Template for chart question answering with a text LLM. It is filled with
# str.format() and has two placeholders: {chart_info} (the textual chart
# summary and data) and {question} (the user's question).
CHART_QUESTION_PROMPT = """Answer this question about a chart.

CHART INFORMATION:
{chart_info}

QUESTION: {question}

Provide a specific, data-driven answer based on the chart information.
If the answer requires reading specific values, provide them.
If the answer involves a trend or comparison, explain your reasoning.

Answer:"""
240
+
241
+
242
+ # =============================================================================
243
+ # Chart Parser
244
+ # =============================================================================
245
+
246
+
247
class ChartParser:
    """
    Parses charts from images using LLM vision capabilities.

    Flow:
    1. Send the image and the interpretation prompt to the vision function
    2. Parse the JSON reply into a ParsedChart
    3. Compute summary statistics and a trend label
    4. Optionally answer questions about the chart with the text LLM
    """

    # Numeric token: optional "$", digits with optional thousands commas,
    # optional decimal part, optional "%". Requiring a leading digit keeps
    # bare punctuation (for example a lone ",") from counting as a number.
    _NUMBER_PATTERN = re.compile(r'\$?\d[\d,]*(?:\.\d+)?%?')

    # Maximum number of values parse_from_description() extracts.
    _MAX_DESCRIPTION_VALUES = 10

    def __init__(
        self,
        llm_fn: Callable[[str], str] | None = None,
        vision_fn: Callable[[str, bytes], str] | None = None,
    ):
        """
        Initialize the chart parser.

        Args:
            llm_fn: LLM function for text interpretation.
            vision_fn: Vision LLM function for image analysis.
        """
        self.llm_fn = llm_fn
        self.vision_fn = vision_fn

    def set_llm_function(self, llm_fn: Callable[[str], str]) -> None:
        """Set the LLM function."""
        self.llm_fn = llm_fn

    def set_vision_function(self, vision_fn: Callable[[str, bytes], str]) -> None:
        """Set the vision function."""
        self.vision_fn = vision_fn

    def parse_from_image(
        self,
        image_bytes: bytes,
        doc_id: str = "",
        page_num: int | None = None,
        node_id: str = "",
    ) -> ParsedChart | None:
        """
        Parse a chart from an image.

        Args:
            image_bytes: Image data.
            doc_id: Document ID.
            page_num: Page number.
            node_id: Node ID.

        Returns:
            ParsedChart, or None if no vision function is configured, the
            reply contains no usable JSON object, or any step raises.
        """
        if not self.vision_fn:
            logger.warning("no_vision_function_configured")
            return None

        try:
            # The template escapes its literal JSON braces as "{{" / "}}".
            # format() collapses them so the model sees a valid JSON example
            # instead of doubled braces.
            prompt = CHART_INTERPRETATION_PROMPT.format()

            # Use vision LLM to interpret the chart
            response = self.vision_fn(prompt, image_bytes)

            # Parse the response
            chart = self._parse_interpretation(response)

            if chart is not None:
                chart.doc_id = doc_id
                chart.page_num = page_num
                chart.node_id = node_id

                # Analyze the chart
                chart.analysis = self._analyze_chart(chart)

                logger.info(
                    "chart_parsed",
                    chart_type=chart.chart_type.value,
                    series=len(chart.series),
                )

            return chart

        except Exception as e:
            # Best-effort boundary around the external vision call: a failed
            # chart must not abort processing of the rest of the document.
            logger.warning("chart_parsing_failed", error=str(e))
            return None

    @staticmethod
    def _extract_json_object(response: str) -> dict[str, Any] | None:
        """Return the JSON object embedded in *response*, or None if absent.

        Raises:
            json.JSONDecodeError: If braces are present but no object can be
                decoded starting at the first "{".
        """
        json_match = re.search(r'\{[\s\S]*\}', response)
        if not json_match:
            return None

        try:
            data = json.loads(json_match.group())
        except json.JSONDecodeError:
            # The greedy match runs to the LAST "}" in the reply, so trailing
            # text containing a brace breaks it. Retry by decoding a single
            # object that starts at the first "{".
            data, _ = json.JSONDecoder().raw_decode(response, json_match.start())

        return data if isinstance(data, dict) else None

    @staticmethod
    def _as_dict(value: Any) -> dict[str, Any]:
        """Return *value* if it is a dict, else an empty dict (e.g. for null)."""
        return value if isinstance(value, dict) else {}

    @staticmethod
    def _as_text(value: Any) -> str:
        """Return *value* as a string, mapping None (JSON null) to ''."""
        return "" if value is None else str(value)

    @staticmethod
    def _build_data_point(point: Any) -> DataPoint | None:
        """Build a DataPoint from one JSON entry, or None if it is unusable."""
        if not isinstance(point, dict):
            return None
        try:
            # A missing "y" defaults to 0; null or non-numeric text is rejected.
            y_value = float(point.get("y", 0))
        except (TypeError, ValueError):
            return None
        return DataPoint(x=point.get("x", ""), y=y_value)

    def _parse_interpretation(self, response: str) -> ParsedChart | None:
        """
        Parse LLM interpretation into structured chart.

        Malformed pieces of the reply (null axes, non-dict series entries,
        points whose "y" is not numeric) are skipped rather than discarding
        the whole chart.

        Args:
            response: Raw text returned by the vision function.

        Returns:
            ParsedChart, or None if no JSON object could be decoded.
        """
        try:
            data = self._extract_json_object(response)
        except (json.JSONDecodeError, ValueError) as e:
            logger.warning("chart_interpretation_parse_failed", error=str(e))
            return None

        if data is None:
            return None

        x_axis = self._as_dict(data.get("x_axis"))
        y_axis = self._as_dict(data.get("y_axis"))

        chart = ParsedChart(
            title=self._as_text(data.get("title")),
            x_axis_label=self._as_text(x_axis.get("label")),
            x_axis_unit=self._as_text(x_axis.get("unit")),
            y_axis_label=self._as_text(y_axis.get("label")),
            y_axis_unit=self._as_text(y_axis.get("unit")),
        )

        # Parse chart type; tolerate case, padding and "stacked bar" spellings.
        raw_type = data.get("chart_type", "unknown")
        if isinstance(raw_type, str):
            raw_type = raw_type.strip().lower().replace(" ", "_").replace("-", "_")
        try:
            chart.chart_type = ChartType(raw_type)
        except ValueError:
            chart.chart_type = ChartType.UNKNOWN

        # Parse series
        raw_series = data.get("series")
        for series_data in raw_series if isinstance(raw_series, list) else []:
            if not isinstance(series_data, dict):
                continue

            series = ChartSeries(name=self._as_text(series_data.get("name")))

            raw_points = series_data.get("data")
            for point in raw_points if isinstance(raw_points, list) else []:
                dp = self._build_data_point(point)
                if dp is None:
                    logger.debug("chart_data_point_skipped", point=str(point))
                    continue
                series.data_points.append(dp)

            chart.series.append(series)

        # Store insights in analysis
        raw_insights = data.get("insights")
        if isinstance(raw_insights, list):
            insights = [str(item) for item in raw_insights if item is not None]
        elif isinstance(raw_insights, str) and raw_insights:
            # A bare string would otherwise be iterated character by character.
            insights = [raw_insights]
        else:
            insights = []

        raw_trend = data.get("trend")
        trend = raw_trend.strip().lower() if isinstance(raw_trend, str) else ""

        if insights or trend:
            chart.analysis = ChartAnalysis(
                trend=trend,
                key_insights=insights,
            )

        return chart

    def _analyze_chart(self, chart: ParsedChart) -> ChartAnalysis:
        """
        Analyze chart data for trends and statistics.

        Fills min/max/average over the Y values of all series. When no trend
        is set yet and at least three values exist, the trend is derived by
        comparing the mean of the first third of the values with the mean of
        the last third. Tolerances use magnitudes so that negative data is
        classified correctly.

        Args:
            chart: Chart whose values are analyzed. An existing
                ``chart.analysis`` is updated in place and returned.

        Returns:
            The updated (or newly created) ChartAnalysis.
        """
        analysis = chart.analysis or ChartAnalysis()

        all_values = chart.get_all_values()

        if not all_values:
            return analysis

        # Basic statistics
        analysis.min_value = min(all_values)
        analysis.max_value = max(all_values)
        analysis.avg_value = sum(all_values) / len(all_values)

        # Trend detection (for time series)
        if len(all_values) >= 3 and not analysis.trend:
            # Simple trend: compare first third to last third.
            third = len(all_values) // 3  # >= 1 because len >= 3
            first_avg = sum(all_values[:third]) / third
            last_avg = sum(all_values[-third:]) / third

            # A move counts only if it exceeds 10% of the starting level.
            tolerance = abs(first_avg) * 0.1

            if last_avg - first_avg > tolerance:
                analysis.trend = "increasing"
            elif first_avg - last_avg > tolerance:
                analysis.trend = "decreasing"
            else:
                # Check for fluctuation
                std_dev = (sum((v - analysis.avg_value) ** 2 for v in all_values) / len(all_values)) ** 0.5
                if std_dev > abs(analysis.avg_value) * 0.2:
                    analysis.trend = "fluctuating"
                else:
                    analysis.trend = "stable"

        # Generate trend description
        first_value = all_values[0]
        last_value = all_values[-1]

        if analysis.trend == "increasing":
            change = ((last_value - first_value) / abs(first_value) * 100) if first_value != 0 else 0
            analysis.trend_description = f"Values show an increasing trend with approximately {change:.1f}% growth"
        elif analysis.trend == "decreasing":
            change = ((first_value - last_value) / abs(first_value) * 100) if first_value != 0 else 0
            analysis.trend_description = f"Values show a decreasing trend with approximately {change:.1f}% decline"
        elif analysis.trend == "stable":
            analysis.trend_description = f"Values remain relatively stable around {analysis.avg_value:.2f}"
        elif analysis.trend == "fluctuating":
            analysis.trend_description = f"Values show significant fluctuation between {analysis.min_value:.2f} and {analysis.max_value:.2f}"

        return analysis

    def parse_from_description(
        self,
        description: str,
        doc_id: str = "",
        page_num: int | None = None,
    ) -> ParsedChart | None:
        """
        Parse chart from text description.

        Useful when chart image is not available but description exists.
        The chart type is guessed from keywords, and up to ten numbers found
        in the text become one series named "Extracted values", with
        sequential integer X positions.

        Args:
            description: Text description of the chart.
            doc_id: Document ID.
            page_num: Page number.

        Returns:
            ParsedChart with extracted information.
        """
        chart = ParsedChart(
            doc_id=doc_id,
            page_num=page_num,
            description=description,
        )

        # Try to extract chart type from description
        description_lower = description.lower()

        if "bar chart" in description_lower or "bar graph" in description_lower:
            chart.chart_type = ChartType.BAR
        elif "line chart" in description_lower or "line graph" in description_lower:
            chart.chart_type = ChartType.LINE
        elif "pie chart" in description_lower:
            chart.chart_type = ChartType.PIE
        elif "scatter" in description_lower:
            chart.chart_type = ChartType.SCATTER

        # Try to extract numbers as data points
        series = ChartSeries(name="Extracted values")
        for token in self._NUMBER_PATTERN.findall(description):
            if len(series.data_points) >= self._MAX_DESCRIPTION_VALUES:
                break
            try:
                value = float(token.replace(',', '').replace('$', '').replace('%', ''))
            except ValueError:
                continue
            # The X position is the running count of accepted values.
            series.data_points.append(DataPoint(x=len(series.data_points), y=value))

        if series.data_points:
            chart.series.append(series)

        return chart

    def answer_question(
        self,
        chart: ParsedChart,
        question: str,
    ) -> str:
        """
        Answer a question about a chart.

        Args:
            chart: The parsed chart.
            question: The question to answer.

        Returns:
            Answer string. If no LLM function is configured or the call
            fails, an explanatory message is returned instead of raising.
        """
        if not self.llm_fn:
            return "LLM not configured for chart Q&A"

        # Build chart info as pieces and join once at the end.
        info_parts = [chart.summarize()]

        if chart.series:
            info_parts.append("\n\nData:\n")
            for series in chart.series:
                if series.name:
                    info_parts.append(f"\n{series.name}:\n")
                for dp in series.data_points[:20]:  # Limit data points
                    info_parts.append(f" {dp.x}: {dp.y}\n")

        if chart.analysis:
            info_parts.append(f"\n\nAnalysis:\n{chart.analysis.trend_description}")
            if chart.analysis.key_insights:
                info_parts.append(
                    "\n\nInsights:\n"
                    + "\n".join(f"- {insight}" for insight in chart.analysis.key_insights)
                )

        prompt = CHART_QUESTION_PROMPT.format(
            chart_info="".join(info_parts),
            question=question,
        )

        try:
            return self.llm_fn(prompt)
        except Exception as e:
            # Best-effort: report the failure as the answer text.
            logger.warning("chart_qa_failed", error=str(e))
            return f"Error answering question: {str(e)}"
529
+
530
+
531
+ # =============================================================================
532
+ # Convenience Functions
533
+ # =============================================================================
534
+
535
+
536
def describe_chart(
    chart: ParsedChart,
    include_data: bool = True,
) -> str:
    """Build a short prose description of a chart.

    The sentences cover, in order: chart type (and title if set), axis
    labels, size of the data, a sample of up to three points from the first
    series (when ``include_data`` is true), the trend description, and up to
    two key insights. Sentences are joined with single spaces.
    """
    kind = chart.chart_type.value
    opening = (
        f"This is a {kind} chart titled '{chart.title}'."
        if chart.title
        else f"This is a {kind} chart."
    )
    sentences = [opening]

    x_label = chart.x_axis_label
    y_label = chart.y_axis_label
    if x_label or y_label:
        sentences.append(f"The X-axis shows {x_label or 'values'} and the Y-axis shows {y_label or 'values'}.")

    all_series = chart.series
    if all_series:
        lead_series = all_series[0]

        if len(all_series) == 1:
            sentences.append(f"It contains {len(lead_series.data_points)} data points.")
        else:
            sentences.append(f"It contains {len(all_series)} data series.")

        # The sample is always drawn from the first series only.
        if include_data and lead_series.data_points:
            preview = ", ".join(f"{dp.x}: {dp.y}" for dp in lead_series.data_points[:3])
            sentences.append(f"Sample data: {preview}...")

    analysis = chart.analysis
    if analysis:
        if analysis.trend_description:
            sentences.append(analysis.trend_description)

        if analysis.key_insights:
            sentences.append("Key insights: " + "; ".join(analysis.key_insights[:2]))

    return " ".join(sentences)