unique_toolkit 1.42.9__py3-none-any.whl → 1.43.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unique_toolkit/_common/experimental/write_up_agent/README.md +848 -0
- unique_toolkit/_common/experimental/write_up_agent/__init__.py +22 -0
- unique_toolkit/_common/experimental/write_up_agent/agent.py +170 -0
- unique_toolkit/_common/experimental/write_up_agent/config.py +42 -0
- unique_toolkit/_common/experimental/write_up_agent/examples/data.csv +13 -0
- unique_toolkit/_common/experimental/write_up_agent/examples/example_usage.py +78 -0
- unique_toolkit/_common/experimental/write_up_agent/schemas.py +36 -0
- unique_toolkit/_common/experimental/write_up_agent/services/__init__.py +13 -0
- unique_toolkit/_common/experimental/write_up_agent/services/dataframe_handler/__init__.py +19 -0
- unique_toolkit/_common/experimental/write_up_agent/services/dataframe_handler/exceptions.py +29 -0
- unique_toolkit/_common/experimental/write_up_agent/services/dataframe_handler/service.py +150 -0
- unique_toolkit/_common/experimental/write_up_agent/services/dataframe_handler/utils.py +130 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/__init__.py +27 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/config.py +56 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/exceptions.py +79 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/prompts/config.py +34 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/prompts/system_prompt.j2 +15 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/prompts/user_prompt.j2 +21 -0
- unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/service.py +369 -0
- unique_toolkit/_common/experimental/write_up_agent/services/template_handler/__init__.py +29 -0
- unique_toolkit/_common/experimental/write_up_agent/services/template_handler/default_template.j2 +37 -0
- unique_toolkit/_common/experimental/write_up_agent/services/template_handler/exceptions.py +39 -0
- unique_toolkit/_common/experimental/write_up_agent/services/template_handler/service.py +191 -0
- unique_toolkit/_common/experimental/write_up_agent/services/template_handler/utils.py +182 -0
- unique_toolkit/_common/experimental/write_up_agent/utils.py +24 -0
- {unique_toolkit-1.42.9.dist-info → unique_toolkit-1.43.1.dist-info}/METADATA +7 -1
- {unique_toolkit-1.42.9.dist-info → unique_toolkit-1.43.1.dist-info}/RECORD +29 -4
- {unique_toolkit-1.42.9.dist-info → unique_toolkit-1.43.1.dist-info}/LICENSE +0 -0
- {unique_toolkit-1.42.9.dist-info → unique_toolkit-1.43.1.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""Utility functions for DataFrame operations."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def to_snake_case(text: str) -> str:
    """
    Convert a string to snake_case.

    This is intended to make column names usable as Jinja attribute names
    (e.g. ``{{ row.my_column }}``).

    Every run of characters that is not a letter, digit or underscore
    (spaces, hyphens, dots, brackets, ...) is treated as a word separator.
    camelCase/PascalCase word boundaries are split as well, while a run of
    capitals is kept together as a single word. Names that start with a
    digit are not altered beyond that.

    Examples:
        >>> to_snake_case("MyColumn")
        'my_column'
        >>> to_snake_case("my_column")
        'my_column'
        >>> to_snake_case("My Column Name")
        'my_column_name'
        >>> to_snake_case("column-name")
        'column_name'
        >>> to_snake_case("Column_123")
        'column_123'
        >>> to_snake_case("HTTPResponse")
        'http_response'
        >>> to_snake_case("Price ($)")
        'price'

    Args:
        text: String to convert

    Returns:
        snake_case version of the string
    """
    # Collapse every run of non-word characters into a single separator
    text = re.sub(r"\W+", "_", text)

    # Split at lower/digit -> upper boundaries ("myColumn") and before the
    # last capital of an acronym that is followed by a word ("HTTPResponse"),
    # so acronyms are not broken into single letters
    text = re.sub(r"(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])", "_", text)

    # Convert to lowercase
    text = text.lower()

    # Remove duplicate underscores, then leading/trailing ones
    text = re.sub(r"_+", "_", text)
    return text.strip("_")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def from_snake_case_to_display_name(text: str) -> str:
    """
    Convert snake_case text back to Title Case for display.

    Each underscore-separated word is capitalized (first letter upper-case,
    the rest lower-case) and the words are joined with single spaces. Empty
    segments caused by leading, trailing or repeated underscores are ignored
    so the result never contains stray spaces.

    Args:
        text: snake_case text to convert

    Returns:
        Title Case version of the text

    Example:
        >>> from_snake_case_to_display_name("executive_summary")
        'Executive Summary'
        >>> from_snake_case_to_display_name("my_column_name")
        'My Column Name'
        >>> from_snake_case_to_display_name("api_design")
        'Api Design'
        >>> from_snake_case_to_display_name("_api__design_")
        'Api Design'
    """
    # Split on underscores, drop empty segments and capitalize each word
    return " ".join(word.capitalize() for word in text.split("_") if word)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def normalize_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert all DataFrame column names to snake_case.

    This normalization ensures column names are compatible with Jinja template
    syntax (e.g., {{ row.my_column }} works, but {{ row.My Column }} doesn't).

    Non-string column labels (for example integer headers) are converted to
    their string form before normalization instead of raising an error.

    Examples:
        >>> df = pd.DataFrame({"My Column": [1], "AnotherColumn": [2]})
        >>> normalized = normalize_column_names(df)
        >>> list(normalized.columns)
        ['my_column', 'another_column']

    Args:
        df: Input DataFrame

    Returns:
        New DataFrame with normalized column names
    """
    ## TODO [UN-16142]: Normalization may lead to duplicate column names, we should handle this case
    # str() guards against non-string labels, which have no .replace()/.lower()
    normalized_columns = {col: to_snake_case(str(col)) for col in df.columns}
    return df.rename(columns=normalized_columns)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def limit_dataframe_rows(df: pd.DataFrame, max_rows: int) -> pd.DataFrame:
    """
    Return an independent copy of the leading rows of a DataFrame.

    Args:
        df: DataFrame to limit
        max_rows: Maximum number of rows

    Returns:
        DataFrame with at most max_rows rows
    """
    # head() already returns the whole frame when it has max_rows rows or
    # fewer, so no separate length check is needed; copy() detaches the
    # result from the caller's frame.
    leading_rows = df.head(max_rows)
    return leading_rows.copy()
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def dataframe_to_dict_records(
|
|
114
|
+
df: pd.DataFrame, columns: list[str] | None = None
|
|
115
|
+
) -> list[dict]:
|
|
116
|
+
"""
|
|
117
|
+
Convert DataFrame to list of dict records.
|
|
118
|
+
|
|
119
|
+
Args:
|
|
120
|
+
df: DataFrame to convert
|
|
121
|
+
columns: Optional list of columns to include
|
|
122
|
+
|
|
123
|
+
Returns:
|
|
124
|
+
List of dict records
|
|
125
|
+
"""
|
|
126
|
+
if columns:
|
|
127
|
+
df = df.loc[:, columns]
|
|
128
|
+
|
|
129
|
+
# Replace NaN with None for better serialization
|
|
130
|
+
return df.where(pd.notna(df), None).to_dict(orient="records")
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Generation handler module."""
|
|
2
|
+
|
|
3
|
+
from unique_toolkit._common.experimental.write_up_agent.services.generation_handler.config import (
|
|
4
|
+
GenerationHandlerConfig,
|
|
5
|
+
)
|
|
6
|
+
from unique_toolkit._common.experimental.write_up_agent.services.generation_handler.exceptions import (
|
|
7
|
+
AggregationError,
|
|
8
|
+
BatchCreationError,
|
|
9
|
+
GenerationHandlerError,
|
|
10
|
+
LLMCallError,
|
|
11
|
+
PromptBuildError,
|
|
12
|
+
TokenLimitError,
|
|
13
|
+
)
|
|
14
|
+
from unique_toolkit._common.experimental.write_up_agent.services.generation_handler.service import (
|
|
15
|
+
GenerationHandler,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
# Public API of the generation handler package (alphabetical).
__all__ = [
    "AggregationError",
    "BatchCreationError",
    "GenerationHandler",
    "GenerationHandlerConfig",
    "GenerationHandlerError",
    "LLMCallError",
    "PromptBuildError",
    "TokenLimitError",
]
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Configuration for generation handler."""
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
from unique_toolkit._common.experimental.write_up_agent.services.generation_handler.prompts.config import (
|
|
6
|
+
GenerationHandlerPromptsConfig,
|
|
7
|
+
)
|
|
8
|
+
from unique_toolkit._common.pydantic_helpers import get_configuration_dict
|
|
9
|
+
from unique_toolkit._common.validators import LMI, get_LMI_default_field
|
|
10
|
+
from unique_toolkit.language_model.default_language_model import DEFAULT_GPT_4o
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class GenerationHandlerConfig(BaseModel):
    """Configuration for generation handler.

    This configuration controls how groups are batched, how prompts are built,
    and how the LLM is called for generating summaries.
    """

    model_config = get_configuration_dict()

    # --- Model selection -------------------------------------------------
    language_model: LMI = get_LMI_default_field(
        DEFAULT_GPT_4o,
        description="The language model to use for generating summaries.",
    )

    # --- Instructions ----------------------------------------------------
    # Shared instruction text; the system prompt template has a
    # `common_instruction` placeholder.
    common_instruction: str = Field(
        description="Common instruction applied to all groups",
        default="You are a technical writer. Summarize the provided content concisely and clearly.",
    )

    # TODO [UN-16142]: Add default instructions for each group
    # Optional per-group overrides; empty by default.
    group_specific_instructions: dict[str, str] = Field(
        description=(
            "Custom instructions per group. "
            "Keys should be formatted as 'column:value' (e.g., 'section:Introduction')"
        ),
        default_factory=dict,
    )

    # --- Batching limits -------------------------------------------------
    # Token budget per batch; values below 100 are rejected by validation.
    max_tokens_per_batch: int = Field(
        description="Maximum tokens per batch for LLM input (affects batching strategy)",
        default=4000,
        ge=100,
    )

    # Row cap per batch; at least one row is required.
    max_rows_per_batch: int = Field(
        description="Maximum rows per batch (secondary limit to tokens)",
        default=20,
        ge=1,
    )

    # --- Prompt templates ------------------------------------------------
    prompts_config: GenerationHandlerPromptsConfig = Field(
        description="Configuration for the prompts.",
        default_factory=GenerationHandlerPromptsConfig,
    )
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Exceptions for generation handler operations."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class GenerationHandlerError(Exception):
    """Root of the generation handler exception hierarchy.

    Catch this type to handle any error raised by the generation handler.
    """
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BatchCreationError(GenerationHandlerError):
    """Signals that batches could not be created.

    Attributes:
        group_key: Value passed as ``group_key``, or None
        row_count: Value passed as ``row_count``, or None
    """

    def __init__(
        self,
        message: str,
        group_key: str | None = None,
        row_count: int | None = None,
    ):
        # Record the diagnostic context, then let the base class store the message
        self.group_key = group_key
        self.row_count = row_count
        super().__init__(message)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class PromptBuildError(GenerationHandlerError):
    """Signals that a prompt could not be built.

    Attributes:
        prompt_type: Value passed as ``prompt_type``, or None
        context: Value passed as ``context``; an empty dict when none
            (or an empty one) was supplied
    """

    def __init__(
        self,
        message: str,
        prompt_type: str | None = None,
        context: dict | None = None,
    ):
        self.prompt_type = prompt_type
        # Normalize a missing/empty context to a fresh dict
        self.context = context if context else {}
        super().__init__(message)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class LLMCallError(GenerationHandlerError):
    """Signals that a call to the LLM failed.

    Attributes:
        group_key: Value passed as ``group_key``, or None
        batch_index: Value passed as ``batch_index``, or None
        error_details: Value passed as ``error_details``, or None
    """

    def __init__(
        self,
        message: str,
        group_key: str | None = None,
        batch_index: int | None = None,
        error_details: str | None = None,
    ):
        # Record the diagnostic context, then let the base class store the message
        self.group_key = group_key
        self.batch_index = batch_index
        self.error_details = error_details
        super().__init__(message)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class AggregationError(GenerationHandlerError):
    """Signals that batch results could not be aggregated.

    Attributes:
        group_key: Value passed as ``group_key``, or None
        batch_count: Value passed as ``batch_count``, or None
    """

    def __init__(
        self,
        message: str,
        group_key: str | None = None,
        batch_count: int | None = None,
    ):
        # Record the diagnostic context, then let the base class store the message
        self.group_key = group_key
        self.batch_count = batch_count
        super().__init__(message)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class TokenLimitError(GenerationHandlerError):
    """Signals a failure in token counting or token limit validation.

    Attributes:
        estimated_tokens: Value passed as ``estimated_tokens``, or None
        max_tokens: Value passed as ``max_tokens``, or None
    """

    def __init__(
        self,
        message: str,
        estimated_tokens: int | None = None,
        max_tokens: int | None = None,
    ):
        # Record the diagnostic context, then let the base class store the message
        self.estimated_tokens = estimated_tokens
        self.max_tokens = max_tokens
        super().__init__(message)
|
unique_toolkit/_common/experimental/write_up_agent/services/generation_handler/prompts/config.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Annotated
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel, Field
|
|
5
|
+
|
|
6
|
+
from unique_toolkit._common.experimental.write_up_agent.utils import template_loader
|
|
7
|
+
from unique_toolkit._common.pydantic.rjsf_tags import RJSFMetaTag
|
|
8
|
+
from unique_toolkit._common.pydantic_helpers import get_configuration_dict
|
|
9
|
+
|
|
10
|
+
PARENT_DIR = Path(__file__).parent

# Default prompt templates that live next to this module, loaded once at
# import time. NOTE(review): template_loader is presumably returning the raw
# template text as a str (it is split on "\n" and used as a str default
# below) - confirm against its definition.
_SYSTEM_PROMPT_TEMPLATE = template_loader(PARENT_DIR, "system_prompt.j2")
_USER_PROMPT_TEMPLATE = template_loader(PARENT_DIR, "user_prompt.j2")


class GenerationHandlerPromptsConfig(BaseModel):
    # Prompt templates used by the generation handler. Both fields default to
    # the bundled .j2 files; the textarea widget is sized to the number of
    # lines in the corresponding default template.
    model_config = get_configuration_dict()

    system_prompt_template: Annotated[
        str,
        RJSFMetaTag.StringWidget.textarea(
            rows=len(_SYSTEM_PROMPT_TEMPLATE.split("\n"))
        ),
    ] = Field(
        default=_SYSTEM_PROMPT_TEMPLATE,
        # Previously described as "for the source selection", which does not
        # match this handler's purpose (generating summaries)
        description="The system prompt template for the summary generation.",
    )
    user_prompt_template: Annotated[
        str,
        RJSFMetaTag.StringWidget.textarea(rows=len(_USER_PROMPT_TEMPLATE.split("\n"))),
    ] = Field(
        default=_USER_PROMPT_TEMPLATE,
        description="The user prompt template for the summary generation.",
    )
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
{# System prompt with common instructions #}
|
|
2
|
+
You are an expert technical writer tasked with creating clear, concise, and well-organized content summaries.
|
|
3
|
+
|
|
4
|
+
## Your Task
|
|
5
|
+
Generate a comprehensive summary for a specific section based on the provided content. The section header is already defined - focus on creating the body content.
|
|
6
|
+
|
|
7
|
+
## Guidelines
|
|
8
|
+
- Synthesize the provided information into a coherent narrative
|
|
9
|
+
- Organize content logically, using sub-sections if it improves clarity
|
|
10
|
+
- Maintain a professional and informative tone
|
|
11
|
+
- Be concise while preserving key information
|
|
12
|
+
- If section-specific instructions are provided, follow them carefully
|
|
13
|
+
|
|
14
|
+
{{ common_instruction }}
|
|
15
|
+
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
{# User prompt with section context and optional previous summary #}
|
|
2
|
+
## Section Context
|
|
3
|
+
You are writing content for the section titled: **"{{ section_name }}"**
|
|
4
|
+
|
|
5
|
+
**IMPORTANT**: The section header is already provided in the final report. Do NOT include the section title (# {{ section_name }}) in your response. Start directly with the content.
|
|
6
|
+
|
|
7
|
+
{% if group_instruction %}
|
|
8
|
+
## Section-Specific Instructions
|
|
9
|
+
{{ group_instruction }}
|
|
10
|
+
|
|
11
|
+
{% endif %}
|
|
12
|
+
{% if previous_summary %}
|
|
13
|
+
## Context from Previous Batch
|
|
14
|
+
The following is a summary from the previous batch of content for this section. Build upon this context when generating your summary:
|
|
15
|
+
|
|
16
|
+
{{ previous_summary }}
|
|
17
|
+
|
|
18
|
+
{% endif %}
|
|
19
|
+
## Content to Summarize
|
|
20
|
+
{{ content }}
|
|
21
|
+
|