unique_toolkit 1.42.9-py3-none-any.whl → 1.43.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unique_toolkit/_common/experimental/write_up_agent/README.md +848 -0
- unique_toolkit/_common/experimental/write_up_agent/__init__.py +22 -0
- unique_toolkit/_common/experimental/write_up_agent/agent.py +170 -0
- unique_toolkit/_common/experimental/write_up_agent/config.py +42 -0
- unique_toolkit/_common/experimental/write_up_agent/examples/data.csv +13 -0
- unique_toolkit/_common/experimental/write_up_agent/examples/example_usage.py +78 -0
- unique_toolkit/_common/experimental/write_up_agent/schemas.py +36 -0
- unique_toolkit/_common/experimental/write_up_agent/services/__init__.py +13 -0
- unique_toolkit/_common/experimental/write_up_agent/services/dataframe_handler/__init__.py +19 -0
- unique_toolkit/_common/experimental/write_up_agent/services/dataframe_handler/exceptions.py +29 -0
- unique_toolkit/_common/experimental/write_up_agent/services/dataframe_handler/service.py +150 -0
- unique_toolkit/_common/experimental/write_up_agent/services/dataframe_handler/utils.py +130 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/__init__.py +27 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/config.py +56 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/exceptions.py +79 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/prompts/config.py +34 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/prompts/system_prompt.j2 +15 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/prompts/user_prompt.j2 +21 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/service.py +369 -0
- unique_toolkit/_common/experimental/write_up_agent/services/template_handler/__init__.py +29 -0
- unique_toolkit/_common/experimental/write_up_agent/services/template_handler/default_template.j2 +37 -0
- unique_toolkit/_common/experimental/write_up_agent/services/template_handler/exceptions.py +39 -0
- unique_toolkit/_common/experimental/write_up_agent/services/template_handler/service.py +191 -0
- unique_toolkit/_common/experimental/write_up_agent/services/template_handler/utils.py +182 -0
- unique_toolkit/_common/experimental/write_up_agent/utils.py +24 -0
- {unique_toolkit-1.42.9.dist-info → unique_toolkit-1.43.1.dist-info}/METADATA +7 -1
- {unique_toolkit-1.42.9.dist-info → unique_toolkit-1.43.1.dist-info}/RECORD +29 -4
- {unique_toolkit-1.42.9.dist-info → unique_toolkit-1.43.1.dist-info}/LICENSE +0 -0
- {unique_toolkit-1.42.9.dist-info → unique_toolkit-1.43.1.dist-info}/WHEEL +0 -0

unique_toolkit/_common/experimental/write_up_agent/__init__.py

```diff
@@ -0,0 +1,22 @@
+"""
+Write-Up Agent: Template-driven DataFrame summarization and report generation.
+"""
+
+from unique_toolkit._common.experimental.write_up_agent.agent import WriteUpAgent
+from unique_toolkit._common.experimental.write_up_agent.config import (
+    WriteUpAgentConfig,
+)
+from unique_toolkit._common.experimental.write_up_agent.schemas import (
+    GroupData,
+    ProcessedGroup,
+)
+
+__all__ = [
+    # Main agent
+    "WriteUpAgent",
+    # Configuration
+    "WriteUpAgentConfig",
+    # Data schemas
+    "GroupData",
+    "ProcessedGroup",
+]
```
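The package `__init__` re-exports the full public surface, so downstream code can import from the package root instead of from submodules. A minimal sketch of what that enables, with the names taken from the `__all__` list above:

```python
# Import the public API from the package root (assumes the 1.43.1 wheel is installed).
from unique_toolkit._common.experimental.write_up_agent import (
    GroupData,
    ProcessedGroup,
    WriteUpAgent,
    WriteUpAgentConfig,
)

# With no arguments, the config falls back to the default Q&A template (see config.py below).
agent = WriteUpAgent(config=WriteUpAgentConfig())
```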

unique_toolkit/_common/experimental/write_up_agent/agent.py

```diff
@@ -0,0 +1,170 @@
+"""
+Write-Up Agent - Main pipeline orchestrator.
+"""
+
+import logging
+
+import pandas as pd
+
+from unique_toolkit._common.experimental.write_up_agent.config import (
+    WriteUpAgentConfig,
+)
+from unique_toolkit._common.experimental.write_up_agent.schemas import GroupData
+from unique_toolkit._common.experimental.write_up_agent.services.dataframe_handler import (
+    DataFrameHandler,
+)
+from unique_toolkit._common.experimental.write_up_agent.services.dataframe_handler.exceptions import (
+    DataFrameGroupingError,
+    DataFrameHandlerError,
+    DataFrameProcessingError,
+    DataFrameValidationError,
+)
+from unique_toolkit._common.experimental.write_up_agent.services.generation_handler import (
+    GenerationHandler,
+    GenerationHandlerError,
+)
+from unique_toolkit._common.experimental.write_up_agent.services.template_handler import (
+    TemplateHandler,
+)
+from unique_toolkit._common.experimental.write_up_agent.services.template_handler.exceptions import (
+    ColumnExtractionError,
+    TemplateHandlerError,
+    TemplateParsingError,
+    TemplateRenderingError,
+    TemplateStructureError,
+)
+from unique_toolkit.language_model.service import LanguageModelService
+
+_LOGGER = logging.getLogger(__name__)
+
+
+class WriteUpAgent:
+    """
+    Main pipeline orchestrator for DataFrame summarization.
+
+    Orchestrates the complete pipeline:
+    1. Extract template info (grouping + columns)
+    2. Validate DataFrame
+    3. Create groups
+    4. Render each group
+    5. Process with LLM
+    6. Return results
+    """
+
+    def __init__(self, config: WriteUpAgentConfig):
+        """
+        Initialize WriteUpAgent.
+
+        Args:
+            config: Configuration with template and settings
+        """
+        self._config = config
+        self._template_handler = TemplateHandler(config.template)
+        self._dataframe_handler = DataFrameHandler()
+
+        # Create generation handler with injected renderer
+        def renderer(group_data: GroupData) -> str:
+            return self._template_handler.render_group(group_data)
+
+        # TODO [UN-16142]: Find a better way to inject the renderer
+        self._generation_handler = GenerationHandler(
+            self._config.generation_handler_config, renderer
+        )
+
+    def process(self, df: pd.DataFrame, llm_service: LanguageModelService) -> str:
+        """
+        Execute complete pipeline and generate final report.
+
+        Args:
+            df: pandas DataFrame to process
+            llm_service: LanguageModelService to use for generating summaries
+
+        Returns:
+            Final markdown report as a single string with all groups processed
+
+        Raises:
+            Various handler exceptions if processing fails
+
+        Example:
+            >>> config = WriteUpAgentConfig(template="...")
+            >>> agent = WriteUpAgent(config)
+            >>> report = agent.process(df, llm_service)
+            >>> print(report)
+        """
+        # TODO [UN-16142]: Add error handling for each step separately
+        try:
+            # Step 1: Extract template structure
+            _LOGGER.info("Extracting template structure...")
+            grouping_column = self._template_handler.get_grouping_column()
+            selected_columns = self._template_handler.get_selected_columns()
+            _LOGGER.info(f"Detected grouping column: {grouping_column}")
+            _LOGGER.info(f"Detected data columns: {selected_columns}")
+
+            # Step 2: Validate DataFrame
+            _LOGGER.info("Validating DataFrame columns...")
+            self._dataframe_handler.validate_columns(
+                df, grouping_column, selected_columns
+            )
+
+            # Step 3: Create groups
+            _LOGGER.info("Creating groups from DataFrame...")
+            groups = self._dataframe_handler.create_groups(
+                df, grouping_column, selected_columns
+            )
+            _LOGGER.info(f"Created {len(groups)} groups")
+
+            # Step 4: Process groups with GenerationHandler
+            _LOGGER.info("Processing groups with GenerationHandler...")
+            processed_groups = self._generation_handler.process_groups(
+                groups, grouping_column, llm_service
+            )
+            _LOGGER.info(f"Generation complete for {len(processed_groups)} groups")
+
+            # Step 5: Render final report with LLM responses
+            _LOGGER.info("Rendering final report...")
+
+            final_report = self._template_handler.render_all_groups(processed_groups)
+
+            _LOGGER.info(f"Report generated ({len(final_report)} characters)")
+
+            return final_report
+
+        except TemplateParsingError as e:
+            _LOGGER.error(f"Template parsing failed: {e}")
+            raise
+
+        except TemplateStructureError as e:
+            _LOGGER.error(f"Template structure invalid: {e}")
+            raise
+
+        except ColumnExtractionError as e:
+            _LOGGER.error(f"Column extraction failed: {e}")
+            raise
+
+        except DataFrameValidationError as e:
+            _LOGGER.error(f"DataFrame validation failed: {e}")
+            raise
+
+        except DataFrameGroupingError as e:
+            _LOGGER.error(f"DataFrame grouping failed: {e}")
+            raise
+
+        except DataFrameProcessingError as e:
+            _LOGGER.error(f"DataFrame processing failed: {e}")
+            raise
+
+        except GenerationHandlerError as e:
+            _LOGGER.error(f"Generation failed: {e}")
+            raise
+
+        except TemplateRenderingError as e:
+            _LOGGER.error(f"Final rendering failed: {e}")
+            raise
+
+        except (TemplateHandlerError, DataFrameHandlerError) as e:
+            _LOGGER.error(f"Handler error: {e}")
+            raise
+
+        except Exception as e:
+            _LOGGER.error(f"Unexpected error: {e}", exc_info=True)
+            raise
```
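The constructor wires `TemplateHandler` into `GenerationHandler` through a small closure rather than passing the handler itself; the `TODO [UN-16142]` above flags this injection for rework. A minimal, self-contained sketch of the same pattern in isolation, using hypothetical `Templater`/`Generator` names (not toolkit classes):

```python
from typing import Callable


class Templater:
    def render_group(self, key: str) -> str:
        return f"## {key}"


class Generator:
    """Depends only on a rendering callable, not on a concrete handler type."""

    def __init__(self, render: Callable[[str], str]) -> None:
        self._render = render

    def run(self, key: str) -> str:
        return self._render(key)


templater = Templater()
# The closure keeps Generator decoupled from Templater's full interface.
generator = Generator(lambda key: templater.render_group(key))
print(generator.run("introduction"))  # -> ## introduction
```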

unique_toolkit/_common/experimental/write_up_agent/config.py

```diff
@@ -0,0 +1,42 @@
+from pydantic import BaseModel, Field, field_validator
+
+from unique_toolkit._common.experimental.write_up_agent.services.generation_handler.config import (
+    GenerationHandlerConfig,
+)
+from unique_toolkit._common.experimental.write_up_agent.services.template_handler import (
+    default_jinja_template_loader,
+)
+from unique_toolkit._common.pydantic_helpers import get_configuration_dict
+
+
+class WriteUpAgentConfig(BaseModel):
+    """Configuration for the Write-Up Agent that generates summaries from DataFrame data.
+
+    The agent uses a Jinja template as the single source of truth for data structure.
+    The template is parsed to automatically detect grouping columns and data references.
+    """
+
+    model_config = get_configuration_dict()
+
+    # Template Configuration (single source of truth)
+    template: str = Field(
+        default_factory=default_jinja_template_loader,
+        description=(
+            "Jinja template string that defines the structure of the summary. "
+            "The template is parsed to automatically detect grouping columns and data references. "
+            "If not provided, loads the default Q&A template. "
+            "Example: '{% for g in groups %}## {{ g.section }}{% endfor %}'"
+        ),
+    )
+
+    generation_handler_config: GenerationHandlerConfig = Field(
+        default_factory=GenerationHandlerConfig,
+        description="Configuration for the generation handler.",
+    )
+
+    @field_validator("template")
+    @classmethod
+    def validate_template_not_empty(cls, v: str) -> str:
+        if not v.strip():
+            raise ValueError("Template must not be empty")
+        return v
```
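Because `template` has a `default_factory`, `WriteUpAgentConfig()` works with no arguments, and the `field_validator` rejects whitespace-only templates at construction time. A short sketch (the custom template string is illustrative only):

```python
from pydantic import ValidationError

from unique_toolkit._common.experimental.write_up_agent.config import WriteUpAgentConfig

# Custom template: grouping column and data references are parsed out of it.
config = WriteUpAgentConfig(
    template="{% for g in groups %}## {{ g.section }}{% endfor %}"
)

try:
    WriteUpAgentConfig(template="   ")  # strips to empty -> rejected
except ValidationError as e:
    print(e)  # includes "Template must not be empty"
```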

unique_toolkit/_common/experimental/write_up_agent/examples/data.csv

```diff
@@ -0,0 +1,13 @@
+section,question,answer
+Introduction,What is the Write-Up Agent?,The Write-Up Agent is a tool that automatically generates summaries from structured DataFrame data using LLM technology.
+Introduction,Who should use this tool?,Data scientists and analysts who need to convert tabular data into readable reports.
+Introduction,What are the key benefits?,Automated report generation with customizable templates and intelligent summarization.
+Methods,How does the agent process data?,The agent groups data by sections and generates summaries for each group using LLM calls with adaptive batching.
+Methods,What is adaptive batching?,A technique that splits large groups into smaller batches to fit within token limits while maintaining context.
+Methods,Can I customize the output format?,Yes! You can provide custom Jinja templates to control the structure and style of the generated report.
+Results,What level of accuracy can I expect?,The agent leverages state-of-the-art LLMs to produce accurate and contextually relevant summaries.
+Results,How fast is the processing?,Processing speed depends on the LLM provider and the size of your dataset. Batching helps optimize performance.
+Results,Can it handle large datasets?,Yes! The agent automatically batches large groups to handle datasets of any size efficiently.
+Conclusion,Is this production-ready?,Yes! The agent includes robust error handling and type-safe operations using Pydantic schemas.
+Conclusion,Where can I find more examples?,Check the examples directory for additional use cases and custom template examples.
+
```
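The sample data has exactly the three columns the default template expects (`section`, `question`, `answer`) and four distinct sections, so the default pipeline would produce four groups. A quick sanity check, assuming the script runs next to `data.csv`:

```python
import pandas as pd

df = pd.read_csv("data.csv")
print(list(df.columns))         # ['section', 'question', 'answer']
print(df["section"].nunique())  # 4 (Introduction, Methods, Results, Conclusion)
```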

unique_toolkit/_common/experimental/write_up_agent/examples/example_usage.py

```diff
@@ -0,0 +1,78 @@
+"""
+Example: Using the Write-Up Agent to generate summaries from DataFrame data.
+"""
+
+# TODO [UN-16142]: Add example usage in tutorial instead of here
+
+import logging
+from pathlib import Path
+
+import pandas as pd
+
+from unique_toolkit._common.experimental.write_up_agent import (
+    WriteUpAgent,
+    WriteUpAgentConfig,
+)
+from unique_toolkit._common.experimental.write_up_agent.services.generation_handler.config import (
+    GenerationHandlerConfig,
+)
+from unique_toolkit.app.unique_settings import UniqueSettings
+from unique_toolkit.language_model.service import LanguageModelService
+
+logging.basicConfig(level=logging.DEBUG)
+
+# Setup paths
+current_dir = Path(__file__).parent
+env_path = current_dir / "unique.env"
+data_path = current_dir / "data.csv"
+
+# Initialize SDK with your API keys
+_SETTINGS = UniqueSettings.from_env(env_file=env_path)
+_SETTINGS.init_sdk()
+
+# Configure the Write-Up Agent
+# Using default configuration which expects: section, question, answer columns
+write_up_agent_config = WriteUpAgentConfig(
+    generation_handler_config=GenerationHandlerConfig(
+        # Optional: Customize generation settings
+        # max_rows_per_batch=20,  # Max rows per batch (default: 20)
+        # max_tokens_per_batch=4000,  # Max tokens per batch (default: 4000)
+        # common_instruction="You are a technical writer...",  # Custom system prompt
+        # group_specific_instructions={
+        #     # IMPORTANT: Both column and value must be in snake_case
+        #     # DataFrame: Section="Introduction" → Key: "section:introduction"
+        #     "section:introduction": "Be welcoming and engaging",
+        #     "section:methods": "Be precise and technical"
+        # }
+    )
+)
+
+# Initialize the agent (the LLM service is passed later, to process())
+write_up_agent = WriteUpAgent(
+    config=write_up_agent_config,
+)
+
+# Load your DataFrame
+# IMPORTANT: DataFrame must have columns: section, question, answer (otherwise adapt the template)
+df = pd.read_csv(data_path)
+
+print(f"Processing {len(df)} rows across {df['section'].nunique()} sections...")
+print(f"Columns in DataFrame: {list(df.columns)}")
+print()
+
+llm_service = LanguageModelService.from_settings(_SETTINGS)
+
+# Generate the report
+report = write_up_agent.process(df, llm_service=llm_service)
+
+# Display the result
+print("=" * 80)
+print("GENERATED REPORT")
+print("=" * 80)
+print(report)
+
+# Optional: Save to file
+output_path = current_dir / "report.md"
+output_path.write_text(report)
+print()
+print(f"Report saved to: {output_path}")
```

unique_toolkit/_common/experimental/write_up_agent/schemas.py

```diff
@@ -0,0 +1,36 @@
+"""Data schemas for the Write-Up Agent."""
+
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+
+class GroupData(BaseModel):
+    """
+    Represents a group of rows from a DataFrame.
+
+    This is the core data structure passed between handlers in the pipeline.
+    """
+
+    group_key: str = Field(
+        ...,
+        description="The value of the grouping column for this group (e.g., 'Introduction', 'Methods')",
+    )
+
+    rows: list[dict[str, Any]] = Field(
+        ...,
+        description="List of row dictionaries containing the selected columns for this group",
+    )
+
+
+class ProcessedGroup(GroupData):
+    """
+    Represents a group after LLM processing.
+
+    Extends GroupData with the LLM-generated response.
+    """
+
+    llm_response: str = Field(
+        ...,
+        description="The LLM-generated summary/output for this group",
+    )
```
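`ProcessedGroup` inherits `group_key` and `rows` from `GroupData` and adds only `llm_response`, so a processed group can be built by extending a raw group's fields. A minimal sketch (assumes pydantic v2, which the `field_validator` usage in config.py implies):

```python
from unique_toolkit._common.experimental.write_up_agent.schemas import (
    GroupData,
    ProcessedGroup,
)

group = GroupData(
    group_key="introduction",
    rows=[{"question": "What is it?", "answer": "A summarization agent."}],
)

# After generation: same fields plus the LLM output.
processed = ProcessedGroup(
    **group.model_dump(),
    llm_response="The introduction explains what the agent does.",
)
print(processed.group_key, len(processed.rows))
```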

unique_toolkit/_common/experimental/write_up_agent/services/__init__.py

```diff
@@ -0,0 +1,13 @@
+"""Services for the write-up agent pipeline."""
+
+from unique_toolkit._common.experimental.write_up_agent.services.dataframe_handler import (
+    DataFrameHandler,
+)
+from unique_toolkit._common.experimental.write_up_agent.services.template_handler import (
+    TemplateHandler,
+)
+
+__all__ = [
+    "DataFrameHandler",
+    "TemplateHandler",
+]
```

unique_toolkit/_common/experimental/write_up_agent/services/dataframe_handler/__init__.py

```diff
@@ -0,0 +1,19 @@
+"""DataFrame handler module."""
+
+from unique_toolkit._common.experimental.write_up_agent.services.dataframe_handler.exceptions import (
+    DataFrameGroupingError,
+    DataFrameHandlerError,
+    DataFrameProcessingError,
+    DataFrameValidationError,
+)
+from unique_toolkit._common.experimental.write_up_agent.services.dataframe_handler.service import (
+    DataFrameHandler,
+)
+
+__all__ = [
+    "DataFrameHandler",
+    "DataFrameHandlerError",
+    "DataFrameValidationError",
+    "DataFrameGroupingError",
+    "DataFrameProcessingError",
+]
```

unique_toolkit/_common/experimental/write_up_agent/services/dataframe_handler/exceptions.py

```diff
@@ -0,0 +1,29 @@
+"""Exceptions for DataFrame handler operations."""
+
+
+class DataFrameHandlerError(Exception):
+    """Base exception for all DataFrame handler errors."""
+
+    pass
+
+
+class DataFrameValidationError(DataFrameHandlerError):
+    """Raised when DataFrame validation fails (e.g., missing columns)."""
+
+    def __init__(self, message: str, missing_columns: list[str] | None = None):
+        super().__init__(message)
+        self.missing_columns = missing_columns or []
+
+
+class DataFrameGroupingError(DataFrameHandlerError):
+    """Raised when DataFrame grouping operation fails."""
+
+    def __init__(self, message: str, grouping_column: str | None = None):
+        super().__init__(message)
+        self.grouping_column = grouping_column
+
+
+class DataFrameProcessingError(DataFrameHandlerError):
+    """Raised when general DataFrame processing fails."""
+
+    pass
```
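Because the validation and grouping errors carry structured attributes (`missing_columns`, `grouping_column`), callers can branch on the details instead of parsing the message string. A short sketch:

```python
from unique_toolkit._common.experimental.write_up_agent.services.dataframe_handler.exceptions import (
    DataFrameValidationError,
)

try:
    raise DataFrameValidationError(
        "DataFrame missing required columns", missing_columns=["answer"]
    )
except DataFrameValidationError as e:
    # Structured access instead of string parsing:
    if "answer" in e.missing_columns:
        print("Add an 'answer' column before re-running the agent.")
```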

unique_toolkit/_common/experimental/write_up_agent/services/dataframe_handler/service.py

```diff
@@ -0,0 +1,150 @@
+"""DataFrame handler service."""
+
+import pandas as pd
+
+from unique_toolkit._common.experimental.write_up_agent.schemas import GroupData
+from unique_toolkit._common.experimental.write_up_agent.services.dataframe_handler.exceptions import (
+    DataFrameGroupingError,
+    DataFrameProcessingError,
+    DataFrameValidationError,
+)
+from unique_toolkit._common.experimental.write_up_agent.services.dataframe_handler.utils import (
+    dataframe_to_dict_records,
+    normalize_column_names,
+    to_snake_case,
+)
+
+
+class DataFrameHandler:
+    """
+    Handles all DataFrame operations.
+
+    This handler automatically converts all column names to snake_case to ensure
+    compatibility with Jinja template syntax. For example:
+    - "My Column" becomes "my_column"
+    - "UserName" becomes "user_name"
+    - "column-name" becomes "column_name"
+
+    This normalization happens automatically during validation and grouping operations.
+
+    Responsibilities:
+    - Normalize column names to snake_case
+    - Validate DataFrame has required columns
+    - Create groups from DataFrame
+    """
+
+    def validate_columns(
+        self, df: pd.DataFrame, grouping_column: str, selected_columns: list[str]
+    ) -> None:
+        """
+        Validate DataFrame has required columns.
+
+        NOTE: Column names are automatically converted to snake_case before validation.
+        Ensure your template uses snake_case column references (e.g., {{ row.my_column }}).
+
+        Args:
+            df: pandas DataFrame to validate
+            grouping_column: Column to group by (should be in snake_case)
+            selected_columns: Columns that should exist (should be in snake_case)
+
+        Raises:
+            DataFrameValidationError: If columns are missing after normalization
+
+        Example:
+            >>> df = pd.DataFrame({"My Section": [1], "My Question": [2]})
+            >>> handler.validate_columns(df, "my_section", ["my_question"])
+            # Validation passes because "My Section" -> "my_section"
+        """
+        # Normalize DataFrame columns to snake_case
+        normalized_df = normalize_column_names(df)
+
+        required_columns = {grouping_column} | set(selected_columns)
+        missing_columns = required_columns - set(normalized_df.columns)
+
+        if missing_columns:
+            raise DataFrameValidationError(
+                f"DataFrame missing required columns after snake_case normalization: {sorted(missing_columns)}. "
+                f"Available columns: {sorted(normalized_df.columns)}",
+                missing_columns=sorted(missing_columns),
+            )
+
+    def create_groups(
+        self, df: pd.DataFrame, grouping_column: str, selected_columns: list[str]
+    ) -> list[GroupData]:
+        """
+        Create groups from DataFrame.
+
+        NOTE: Column names are automatically converted to snake_case.
+        Group values (group_key) are also normalized to snake_case.
+
+        The returned GroupData instances will have:
+        - snake_case column names in their rows
+        - snake_case group_key values
+
+        IMPORTANT: Groups are returned in the order of their first appearance in the DataFrame,
+        NOT sorted alphabetically. This preserves the logical flow of your data.
+
+        Args:
+            df: pandas DataFrame to group
+            grouping_column: Column to group by (should be in snake_case)
+            selected_columns: Columns to include in rows (should be in snake_case)
+
+        Returns:
+            List of GroupData instances in order of first appearance, each containing
+            group_key (in snake_case) and rows with snake_case columns
+
+        Raises:
+            DataFrameGroupingError: If grouping fails
+            DataFrameProcessingError: If data processing fails
+
+        Example:
+            >>> df = pd.DataFrame({
+            ...     "My Section": ["Intro", "Methods", "Results", "Intro"],
+            ...     "My Question": ["Q1", "Q2", "Q3", "Q4"],
+            ... })
+            >>> groups = handler.create_groups(df, "my_section", ["my_question"])
+            >>> [g.group_key for g in groups]
+            ['intro', 'methods', 'results']  # Values normalized to snake_case, order preserved
+        """
+        # Normalize column names to snake_case
+        normalized_df = normalize_column_names(df)
+
+        if grouping_column not in normalized_df.columns:
+            raise DataFrameGroupingError(
+                f"Grouping column '{grouping_column}' not found in normalized DataFrame. "
+                f"Available columns: {sorted(normalized_df.columns)}",
+                grouping_column=grouping_column,
+            )
+
+        try:
+            # Use sort=False to preserve the order of first appearance in the DataFrame
+            grouped = normalized_df.groupby(grouping_column, sort=False)
+        except Exception as e:
+            raise DataFrameGroupingError(
+                f"Failed to group DataFrame by '{grouping_column}': {e}",
+                grouping_column=grouping_column,
+            ) from e
+
+        results = []
+
+        try:
+            for group_key, group_df in grouped:
+                # Filter columns if specified
+                if selected_columns:
+                    cols_to_use = [c for c in selected_columns if c in group_df.columns]
+                    limited_df = group_df.loc[:, cols_to_use]
+                else:
+                    limited_df = group_df
+
+                # Convert to dict records
+                rows = dataframe_to_dict_records(limited_df)
+
+                # Normalize group_key value to snake_case for consistency with template syntax
+                normalized_group_key = to_snake_case(str(group_key))
+
+                # Create GroupData instance with proper typing
+                results.append(GroupData(group_key=normalized_group_key, rows=rows))
+        except Exception as e:
+            raise DataFrameProcessingError(f"Error processing grouped data: {e}") from e
+
+        return results
```
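`normalize_column_names`, `to_snake_case`, and `dataframe_to_dict_records` come from the handler's `utils.py`, whose body is not included in this section of the diff. A rough sketch of what the normalization described in the docstrings implies — this is an assumption about the helpers' behavior, not the toolkit's actual implementation:

```python
import re

import pandas as pd


def to_snake_case(name: str) -> str:
    # Assumed behavior per the docstrings above: "My Column" -> "my_column",
    # "UserName" -> "user_name", "column-name" -> "column_name".
    name = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", "_", name)  # split camelCase boundaries
    return re.sub(r"[\s\-]+", "_", name).lower()         # spaces/hyphens -> underscores


def normalize_column_names(df: pd.DataFrame) -> pd.DataFrame:
    return df.rename(columns={c: to_snake_case(str(c)) for c in df.columns})


df = pd.DataFrame({"My Section": ["Intro"], "UserName": ["a"]})
print(list(normalize_column_names(df).columns))  # ['my_section', 'user_name']
```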