rnsr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,569 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RNSR Chart Parser
|
|
3
|
+
|
|
4
|
+
Extracts and interprets charts from documents.
|
|
5
|
+
Enables answering questions about visual data representations.
|
|
6
|
+
|
|
7
|
+
Features:
|
|
8
|
+
- Chart type detection (bar, line, pie, scatter)
|
|
9
|
+
- Data point extraction
|
|
10
|
+
- Trend analysis
|
|
11
|
+
- LLM-based chart interpretation
|
|
12
|
+
|
|
13
|
+
Note: Requires vision capabilities for actual chart parsing.
|
|
14
|
+
This module provides the framework and LLM interpretation layer.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import json
|
|
20
|
+
import re
|
|
21
|
+
from dataclasses import dataclass, field
|
|
22
|
+
from enum import Enum
|
|
23
|
+
from typing import Any, Callable
|
|
24
|
+
from uuid import uuid4
|
|
25
|
+
|
|
26
|
+
import structlog
|
|
27
|
+
|
|
28
|
+
logger = structlog.get_logger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# =============================================================================
|
|
32
|
+
# Data Models
|
|
33
|
+
# =============================================================================
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class ChartType(str, Enum):
    """Closed set of chart categories a parsed chart can be tagged with.

    The enum mixes in ``str``, so every member compares equal to its
    lowercase value (for example ``ChartType.BAR == "bar"``) and can be
    looked up from that value with ``ChartType("bar")``.
    """

    # Single-family charts.
    BAR = "bar"
    LINE = "line"
    PIE = "pie"
    SCATTER = "scatter"
    AREA = "area"
    HISTOGRAM = "histogram"

    # Composite layouts.
    STACKED_BAR = "stacked_bar"
    COMBINATION = "combination"

    # Fallback used when the type could not be determined.
    UNKNOWN = "unknown"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class DataPoint:
    """One observation plotted on a chart."""

    x: str | float  # position on the X axis: a category label or a number
    y: float  # value on the Y axis
    series: str = ""  # owning series name, for multi-series charts
    label: str = ""  # text shown next to the point

    def to_dict(self) -> dict[str, Any]:
        """Return the point's fields as a plain dictionary."""
        exported = ("x", "y", "series", "label")
        return {key: getattr(self, key) for key in exported}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@dataclass
class ChartSeries:
    """A named, ordered collection of data points belonging to one chart."""

    name: str = ""
    data_points: list[DataPoint] = field(default_factory=list)
    color: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the series as a dictionary, serializing each point in order."""
        serialized_points = []
        for point in self.data_points:
            serialized_points.append(point.to_dict())
        return {
            "name": self.name,
            "data_points": serialized_points,
            "color": self.color,
        }

    def get_values(self) -> list[float]:
        """Return the Y value of every point, in series order."""
        y_values = []
        for point in self.data_points:
            y_values.append(point.y)
        return y_values

    def get_labels(self) -> list[str]:
        """Return the X value of every point converted to ``str``."""
        return list(map(str, (point.x for point in self.data_points)))
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@dataclass
class ChartAnalysis:
    """Summary statistics and trend information derived for a chart."""

    trend: str = ""  # "increasing", "decreasing", "stable", "fluctuating"
    trend_description: str = ""
    min_value: float | None = None
    max_value: float | None = None
    avg_value: float | None = None
    key_insights: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return every analysis field as a plain dictionary."""
        exported = (
            "trend",
            "trend_description",
            "min_value",
            "max_value",
            "avg_value",
            "key_insights",
        )
        return {key: getattr(self, key) for key in exported}
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
@dataclass
class ParsedChart:
    """Structured result of parsing one chart.

    Groups the chart's source location, title/type/axis metadata, extracted
    data series, an optional analysis and a free-text description.
    """

    # Generated identifier: "chart_" plus the first 8 characters of a UUID4.
    id: str = field(default_factory=lambda: f"chart_{str(uuid4())[:8]}")

    # Source information
    doc_id: str = ""
    page_num: int | None = None
    node_id: str = ""

    # Chart metadata
    title: str = ""
    chart_type: ChartType = ChartType.UNKNOWN

    # Axis information
    x_axis_label: str = ""
    y_axis_label: str = ""
    x_axis_unit: str = ""
    y_axis_unit: str = ""

    # Data
    series: list[ChartSeries] = field(default_factory=list)

    # Analysis
    analysis: ChartAnalysis | None = None

    # LLM interpretation
    description: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the chart as a dictionary.

        The chart type is exported as its string value, each series through
        its own ``to_dict()``, and ``analysis`` as ``None`` when absent.
        """
        return {
            "id": self.id,
            "doc_id": self.doc_id,
            "page_num": self.page_num,
            "node_id": self.node_id,
            "title": self.title,
            "chart_type": self.chart_type.value,
            "x_axis_label": self.x_axis_label,
            "y_axis_label": self.y_axis_label,
            "x_axis_unit": self.x_axis_unit,
            "y_axis_unit": self.y_axis_unit,
            "series": [s.to_dict() for s in self.series],
            "analysis": self.analysis.to_dict() if self.analysis else None,
            "description": self.description,
        }

    def get_all_values(self) -> list[float]:
        """Return the Y values of all series, concatenated in series order."""
        return [value for series in self.series for value in series.get_values()]

    def summarize(self) -> str:
        """Return a newline-separated text summary of the chart.

        Lines are emitted only for information that is present: title,
        type (always), series/data-point counts, trend and value range.
        """
        parts = []

        if self.title:
            parts.append(f"Chart: {self.title}")

        parts.append(f"Type: {self.chart_type.value}")

        if self.series:
            parts.append(f"Series: {len(self.series)}")
            total_points = sum(len(s.data_points) for s in self.series)
            parts.append(f"Data points: {total_points}")

        if self.analysis:
            if self.analysis.trend:
                parts.append(f"Trend: {self.analysis.trend}")
            low = self.analysis.min_value
            high = self.analysis.max_value
            # Both bounds are formatted with ".2f", which raises TypeError on
            # None, so the range line needs both of them to be set.
            if low is not None and high is not None:
                parts.append(f"Range: {low:.2f} - {high:.2f}")

        return "\n".join(parts)
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
# =============================================================================
|
|
196
|
+
# Chart Interpretation Prompts
|
|
197
|
+
# =============================================================================
|
|
198
|
+
|
|
199
|
+
CHART_INTERPRETATION_PROMPT = """Analyze this chart image and extract information.
|
|
200
|
+
|
|
201
|
+
Describe:
|
|
202
|
+
1. CHART TYPE: What type of chart is this? (bar, line, pie, scatter, etc.)
|
|
203
|
+
2. TITLE: What is the chart title?
|
|
204
|
+
3. AXES: What are the X and Y axis labels and units?
|
|
205
|
+
4. DATA: List the data points you can identify (approximate values are fine)
|
|
206
|
+
5. TREND: Is there a trend (increasing, decreasing, stable)?
|
|
207
|
+
6. KEY INSIGHTS: What are the main takeaways from this chart?
|
|
208
|
+
|
|
209
|
+
Respond in JSON:
|
|
210
|
+
{{
|
|
211
|
+
"chart_type": "bar|line|pie|scatter|area|histogram|unknown",
|
|
212
|
+
"title": "...",
|
|
213
|
+
"x_axis": {{"label": "...", "unit": "..."}},
|
|
214
|
+
"y_axis": {{"label": "...", "unit": "..."}},
|
|
215
|
+
"series": [
|
|
216
|
+
{{
|
|
217
|
+
"name": "Series name",
|
|
218
|
+
"data": [
|
|
219
|
+
{{"x": "label or value", "y": numeric_value}}
|
|
220
|
+
]
|
|
221
|
+
}}
|
|
222
|
+
],
|
|
223
|
+
"trend": "increasing|decreasing|stable|fluctuating",
|
|
224
|
+
"insights": ["insight 1", "insight 2"]
|
|
225
|
+
}}"""
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
CHART_QUESTION_PROMPT = """Answer this question about a chart.
|
|
229
|
+
|
|
230
|
+
CHART INFORMATION:
|
|
231
|
+
{chart_info}
|
|
232
|
+
|
|
233
|
+
QUESTION: {question}
|
|
234
|
+
|
|
235
|
+
Provide a specific, data-driven answer based on the chart information.
|
|
236
|
+
If the answer requires reading specific values, provide them.
|
|
237
|
+
If the answer involves a trend or comparison, explain your reasoning.
|
|
238
|
+
|
|
239
|
+
Answer:"""
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
# =============================================================================
|
|
243
|
+
# Chart Parser
|
|
244
|
+
# =============================================================================
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
class ChartParser:
    """
    Parses charts from images using LLM vision capabilities.

    Flow:
    1. Send the image to the vision function with the interpretation prompt
    2. Parse the JSON reply into a ParsedChart
    3. Compute statistics and a trend over the extracted values
    4. Optionally answer questions about the chart with the text LLM
    """

    # A number in free text: optional "$", digits (commas accepted only as
    # thousands separators), optional decimal part, optional "%". Requiring
    # a digit keeps bare punctuation such as "," from matching.
    _NUMBER_PATTERN = re.compile(r"\$?(?:\d{1,3}(?:,\d{3})+(?!\d)|\d+)(?:\.\d+)?%?")

    def __init__(
        self,
        llm_fn: Callable[[str], str] | None = None,
        vision_fn: Callable[[str, bytes], str] | None = None,
    ):
        """
        Initialize the chart parser.

        Args:
            llm_fn: LLM function for text interpretation (prompt -> reply).
            vision_fn: Vision LLM function for image analysis
                (prompt, image bytes -> reply).
        """
        self.llm_fn = llm_fn
        self.vision_fn = vision_fn

    def set_llm_function(self, llm_fn: Callable[[str], str]) -> None:
        """Set the LLM function."""
        self.llm_fn = llm_fn

    def set_vision_function(self, vision_fn: Callable[[str, bytes], str]) -> None:
        """Set the vision function."""
        self.vision_fn = vision_fn

    # ------------------------------------------------------------------
    # Small coercion helpers for loosely structured LLM output
    # ------------------------------------------------------------------

    @staticmethod
    def _to_float(value: Any) -> float | None:
        """Return ``float(value)``, or None when the value is not numeric."""
        if value is None:
            return None
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _as_text(value: Any) -> str:
        """Return strings unchanged, "" for None, ``str(value)`` otherwise."""
        if value is None:
            return ""
        return value if isinstance(value, str) else str(value)

    @staticmethod
    def _as_dict(value: Any) -> dict[str, Any]:
        """Return the value when it is a dict, otherwise an empty dict."""
        return value if isinstance(value, dict) else {}

    @staticmethod
    def _interpretation_prompt() -> str:
        """Return the vision prompt with its ``{{``/``}}`` escapes resolved.

        The prompt template writes its JSON example with doubled braces, so
        it has to pass through ``str.format`` before the model sees it. If
        the template cannot be formatted it is returned unchanged.
        """
        try:
            return CHART_INTERPRETATION_PROMPT.format()
        except (KeyError, IndexError, ValueError):
            return CHART_INTERPRETATION_PROMPT

    @classmethod
    def _extract_numbers(cls, text: str, limit: int = 10) -> list[float]:
        """Return up to ``limit`` numbers found in ``text``, in order.

        "$", "," and "%" decorations are stripped before conversion.
        """
        values = []
        for token in cls._NUMBER_PATTERN.findall(text)[:limit]:
            cleaned = token.replace(",", "").replace("$", "").replace("%", "")
            values.append(float(cleaned))
        return values

    @staticmethod
    def _build_chart_info(chart: ParsedChart) -> str:
        """Render the summary, data, analysis and insights of a chart as text."""
        pieces = [chart.summarize()]

        if chart.series:
            pieces.append("\n\nData:\n")
            for series in chart.series:
                if series.name:
                    pieces.append(f"\n{series.name}:\n")
                # Limit data points per series to keep the prompt small.
                # NOTE(review): leading whitespace of this line is copied
                # from the rendered source - confirm against the real file.
                pieces.extend(f" {dp.x}: {dp.y}\n" for dp in series.data_points[:20])

        if chart.analysis:
            pieces.append(f"\n\nAnalysis:\n{chart.analysis.trend_description}")
            if chart.analysis.key_insights:
                pieces.append(
                    "\n\nInsights:\n"
                    + "\n".join(f"- {insight}" for insight in chart.analysis.key_insights)
                )

        return "".join(pieces)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def parse_from_image(
        self,
        image_bytes: bytes,
        doc_id: str = "",
        page_num: int | None = None,
        node_id: str = "",
    ) -> ParsedChart | None:
        """
        Parse a chart from an image.

        Args:
            image_bytes: Image data.
            doc_id: Document ID.
            page_num: Page number.
            node_id: Node ID.

        Returns:
            ParsedChart, or None if no vision function is configured or
            parsing fails (failures are logged, not raised).
        """
        if not self.vision_fn:
            logger.warning("no_vision_function_configured")
            return None

        try:
            # Use vision LLM to interpret the chart
            response = self.vision_fn(self._interpretation_prompt(), image_bytes)

            # Parse the response
            chart = self._parse_interpretation(response)

            if chart is not None:
                chart.doc_id = doc_id
                chart.page_num = page_num
                chart.node_id = node_id

                # Analyze the chart
                chart.analysis = self._analyze_chart(chart)

                logger.info(
                    "chart_parsed",
                    chart_type=chart.chart_type.value,
                    series=len(chart.series),
                )

            return chart

        except Exception as e:
            # Boundary around user-supplied callables: log and report failure.
            logger.warning("chart_parsing_failed", error=str(e))
            return None

    def _parse_interpretation(self, response: str) -> ParsedChart | None:
        """Parse LLM interpretation into structured chart.

        The first ``{`` through the last ``}`` of the reply is decoded as
        JSON. Returns None when no such span exists, it is not valid JSON,
        or it does not decode to an object. Malformed parts of an otherwise
        usable reply (non-dict axes, non-numeric ``y`` values, non-list
        series) are skipped instead of discarding the whole chart.
        """
        json_match = re.search(r'\{[\s\S]*\}', response)
        if not json_match:
            return None

        try:
            data = json.loads(json_match.group())
        except ValueError as e:  # json.JSONDecodeError subclasses ValueError
            logger.warning("chart_interpretation_parse_failed", error=str(e))
            return None

        if not isinstance(data, dict):
            logger.warning(
                "chart_interpretation_parse_failed",
                error="JSON payload is not an object",
            )
            return None

        x_axis = self._as_dict(data.get("x_axis"))
        y_axis = self._as_dict(data.get("y_axis"))

        chart = ParsedChart(
            title=self._as_text(data.get("title")),
            x_axis_label=self._as_text(x_axis.get("label")),
            x_axis_unit=self._as_text(x_axis.get("unit")),
            y_axis_label=self._as_text(y_axis.get("label")),
            y_axis_unit=self._as_text(y_axis.get("unit")),
        )

        # Parse chart type, tolerating case and separator variations
        raw_type = data.get("chart_type", "unknown")
        if isinstance(raw_type, str):
            raw_type = raw_type.strip().lower().replace("-", "_").replace(" ", "_")
        try:
            chart.chart_type = ChartType(raw_type)
        except ValueError:
            chart.chart_type = ChartType.UNKNOWN

        # Parse series
        raw_series = data.get("series")
        for series_data in raw_series if isinstance(raw_series, list) else []:
            if not isinstance(series_data, dict):
                continue

            series = ChartSeries(name=self._as_text(series_data.get("name")))

            raw_points = series_data.get("data")
            for point in raw_points if isinstance(raw_points, list) else []:
                if not isinstance(point, dict):
                    continue
                # A missing "y" keeps the historical default of 0; a present
                # but unusable one drops only that point.
                y_value = self._to_float(point.get("y", 0))
                if y_value is None:
                    continue
                series.data_points.append(DataPoint(x=point.get("x", ""), y=y_value))

            chart.series.append(series)

        # Store insights in analysis
        raw_insights = data.get("insights")
        if isinstance(raw_insights, list):
            insights = [str(item) for item in raw_insights]
        elif isinstance(raw_insights, str) and raw_insights:
            insights = [raw_insights]
        else:
            insights = []

        raw_trend = data.get("trend")
        trend = raw_trend.strip().lower() if isinstance(raw_trend, str) else ""

        if insights or trend:
            chart.analysis = ChartAnalysis(
                trend=trend,
                key_insights=insights,
            )

        return chart

    def _analyze_chart(self, chart: ParsedChart) -> ChartAnalysis:
        """Analyze chart data for trends and statistics.

        Fills min/max/average over the values of all series. When no trend
        is set yet and at least three values exist, a trend is derived by
        comparing the mean of the first third of the values with the mean
        of the last third. Thresholds are relative to magnitudes, so data
        with negative values is classified the same way as positive data.
        """
        analysis = chart.analysis or ChartAnalysis()

        all_values = chart.get_all_values()

        if not all_values:
            return analysis

        # Basic statistics
        analysis.min_value = min(all_values)
        analysis.max_value = max(all_values)
        analysis.avg_value = sum(all_values) / len(all_values)

        # Trend detection (for time series)
        if len(all_values) >= 3 and not analysis.trend:
            # Simple trend: compare first third to last third
            third = len(all_values) // 3  # at least 1 because len >= 3
            first_avg = sum(all_values[:third]) / third
            last_avg = sum(all_values[-third:]) / third

            # 10% of the starting magnitude; abs() keeps the band on the
            # correct side of first_avg when the values are negative.
            margin = abs(first_avg) * 0.1

            if last_avg > first_avg + margin:
                analysis.trend = "increasing"
            elif last_avg < first_avg - margin:
                analysis.trend = "decreasing"
            else:
                # Check for fluctuation
                variance = sum((v - analysis.avg_value) ** 2 for v in all_values) / len(all_values)
                std_dev = variance ** 0.5
                if std_dev > abs(analysis.avg_value) * 0.2:
                    analysis.trend = "fluctuating"
                else:
                    analysis.trend = "stable"

        # Generate trend description
        first_value = all_values[0]
        last_value = all_values[-1]
        if analysis.trend == "increasing":
            change = ((last_value - first_value) / abs(first_value) * 100) if first_value != 0 else 0
            analysis.trend_description = f"Values show an increasing trend with approximately {change:.1f}% growth"
        elif analysis.trend == "decreasing":
            change = ((first_value - last_value) / abs(first_value) * 100) if first_value != 0 else 0
            analysis.trend_description = f"Values show a decreasing trend with approximately {change:.1f}% decline"
        elif analysis.trend == "stable":
            analysis.trend_description = f"Values remain relatively stable around {analysis.avg_value:.2f}"
        elif analysis.trend == "fluctuating":
            analysis.trend_description = f"Values show significant fluctuation between {analysis.min_value:.2f} and {analysis.max_value:.2f}"

        return analysis

    def parse_from_description(
        self,
        description: str,
        doc_id: str = "",
        page_num: int | None = None,
    ) -> ParsedChart | None:
        """
        Parse chart from text description.

        Useful when chart image is not available but description exists.
        The chart type is guessed from keywords and up to ten numbers found
        in the text become one series named "Extracted values", with the
        position of each number as its X value.

        Args:
            description: Text description of the chart.
            doc_id: Document ID.
            page_num: Page number.

        Returns:
            ParsedChart with extracted information.
        """
        chart = ParsedChart(
            doc_id=doc_id,
            page_num=page_num,
            description=description,
        )

        # Try to extract chart type from description; first match wins
        description_lower = description.lower()
        keyword_types = (
            (("bar chart", "bar graph"), ChartType.BAR),
            (("line chart", "line graph"), ChartType.LINE),
            (("pie chart",), ChartType.PIE),
            (("scatter",), ChartType.SCATTER),
        )
        for keywords, chart_type in keyword_types:
            if any(keyword in description_lower for keyword in keywords):
                chart.chart_type = chart_type
                break

        # Try to extract numbers as data points
        values = self._extract_numbers(description)
        if values:
            series = ChartSeries(name="Extracted values")
            for index, value in enumerate(values):
                series.data_points.append(DataPoint(x=index, y=value))
            chart.series.append(series)

        return chart

    def answer_question(
        self,
        chart: ParsedChart,
        question: str,
    ) -> str:
        """
        Answer a question about a chart.

        Args:
            chart: The parsed chart.
            question: The question to answer.

        Returns:
            Answer string. A fixed message is returned when no LLM function
            is configured, and an error message when the LLM call raises.
        """
        if not self.llm_fn:
            return "LLM not configured for chart Q&A"

        prompt = CHART_QUESTION_PROMPT.format(
            chart_info=self._build_chart_info(chart),
            question=question,
        )

        try:
            return self.llm_fn(prompt)
        except Exception as e:
            logger.warning("chart_qa_failed", error=str(e))
            return f"Error answering question: {str(e)}"
|
|
529
|
+
|
|
530
|
+
|
|
531
|
+
# =============================================================================
|
|
532
|
+
# Convenience Functions
|
|
533
|
+
# =============================================================================
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
def describe_chart(
    chart: ParsedChart,
    include_data: bool = True,
) -> str:
    """Generate a natural language description of a chart.

    Args:
        chart: The parsed chart to describe.
        include_data: When true, up to three points of the first series are
            quoted as sample data.

    Returns:
        Sentences joined by single spaces: type and title, axes, size,
        optional sample data, trend description and up to two key insights.
    """
    parts = []

    if chart.title:
        parts.append(f"This is a {chart.chart_type.value} chart titled '{chart.title}'.")
    else:
        parts.append(f"This is a {chart.chart_type.value} chart.")

    if chart.x_axis_label or chart.y_axis_label:
        parts.append(f"The X-axis shows {chart.x_axis_label or 'values'} and the Y-axis shows {chart.y_axis_label or 'values'}.")

    if chart.series:
        first_points = chart.series[0].data_points

        if len(chart.series) == 1:
            parts.append(f"It contains {len(first_points)} data points.")
        else:
            parts.append(f"It contains {len(chart.series)} data series.")

        # Indexing series[0] is only valid inside the non-empty guard above.
        if include_data and first_points:
            sample = first_points[:3]
            sample_str = ", ".join(f"{dp.x}: {dp.y}" for dp in sample)
            # The ellipsis signals truncation, so use it only when points
            # were actually left out of the sample.
            ending = "..." if len(first_points) > len(sample) else "."
            parts.append(f"Sample data: {sample_str}{ending}")

    if chart.analysis:
        if chart.analysis.trend_description:
            parts.append(chart.analysis.trend_description)

        if chart.analysis.key_insights:
            parts.append("Key insights: " + "; ".join(chart.analysis.key_insights[:2]))

    return " ".join(parts)
|