sutro 0.1.37__py3-none-any.whl → 0.1.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sutro might be problematic. Click here for more details.
- sutro/cli.py +1 -1
- sutro/common.py +220 -0
- sutro/interfaces.py +90 -0
- sutro/sdk.py +333 -579
- sutro/templates/classification.py +117 -0
- sutro/templates/embed.py +53 -0
- sutro/validation.py +60 -0
- {sutro-0.1.37.dist-info → sutro-0.1.38.dist-info}/METADATA +1 -1
- sutro-0.1.38.dist-info/RECORD +12 -0
- sutro-0.1.37.dist-info/RECORD +0 -7
- {sutro-0.1.37.dist-info → sutro-0.1.38.dist-info}/WHEEL +0 -0
- {sutro-0.1.37.dist-info → sutro-0.1.38.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from typing import Union, List
|
|
3
|
+
import polars as pl
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
from ..common import ModelOptions
|
|
8
|
+
from ..interfaces import BaseSutroClient
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ClassificationTemplates(BaseSutroClient):
    """Classification template built on ``self.infer`` and ``self.await_job_completion``."""

    def classify(
        self,
        data: Union[List, pd.DataFrame, pl.DataFrame, str],
        classes: Union[dict[str, str], list[str]],
        model: ModelOptions = "gemma-3-12b-it",
        job_priority: int = 0,
        name: Union[str, List[str]] = None,
        description: Union[str, List[str]] = None,
        output_column: str = "inference_result",
        column: Union[str, List[str]] = None,
        truncate_rows: bool = True,
        include_scratchpad: bool = False,
    ):
        """
        A simple template style function to perform classification on the provided data with Sutro.
        The intention is that the implemented code should be very easy to extend further, while
        showing a basic structure for large-scale classification with Sutro.

        It uses structured outputs with a scratchpad field, enabling the model to reason
        step-by-step before providing the final classification.
        The method supports various input formats including lists, DataFrames (Polars or Pandas),
        file paths, and datasets.
        The method will wait for the classification job to complete before returning the results.

        Args:
            data (Union[List, pd.DataFrame, pl.DataFrame, str]): The data to classify. Each row
                should contain some text to classify that fits into one of the passed in labels.
            classes (Union[dict[str, str], list[str]]): The classification classes. Must contain
                at least one class. Can be either:
                - A list of class names, ie ["Positive", "Negative", "Neutral"]
                - A dict mapping class labels to descriptions,
                  ie {"Positive": "Expresses satisfaction...", ...}
                Providing descriptions can improve classification accuracy, especially for
                ambiguous or domain-specific categories.
            model (ModelOptions, optional): The LLM to use. Defaults to "gemma-3-12b-it"; a model
                chosen for its balance of performance and efficiency, that also retains competency
                across a broad number of different domains.
            job_priority (int, optional): The priority of the job. Defaults to 0.
            name (Union[str, List[str]], optional): A job name for experiment/metadata tracking
                purposes. Defaults to None.
            description (Union[str, List[str]], optional): A job description for
                experiment/metadata tracking purposes. Defaults to None.
            output_column (str, optional): The column name to store the classification results in
                if the input is a DataFrame. Defaults to "inference_result".
            column (Union[str, List[str]], optional): The column name to use for classification.
                Required if data is a DataFrame, file path, or dataset. If a list is supplied, it
                will concatenate the columns of the list into a single column, accepting separator
                strings.
            truncate_rows (bool, optional): If True, any rows that have a token count exceeding
                the context window length of the selected model will be truncated to the max
                length that will fit within the context window. Defaults to True.
            include_scratchpad (bool, optional): If True, includes the model's thinking scratchpad
                in the output. If False, only returns the final classification. Defaults to False.

        Returns:
            The completed classification results for the provided data. If include_scratchpad is
            True, returns both scratchpad and classification fields in JSON object. If False,
            returns only the classification as a string.

        Raises:
            ValueError: If ``classes`` is empty, since the prompt would then offer the model
                nothing to choose from.
        """
        # Render one bullet per class; dict inputs carry a description per label.
        if isinstance(classes, dict):
            class_lines = [f"- {label}: {desc}" for label, desc in classes.items()]
        else:
            class_lines = [f"- {label}" for label in classes]

        # Fail before submitting a job whose prompt would list no classes at all.
        if not class_lines:
            raise ValueError("`classes` must contain at least one class.")

        formatted_classes = "\n".join(class_lines)

        system_prompt = f"""You are an expert classifier. Your task is to accurately categorize the input into one of the provided classes.

## Classes

{formatted_classes}

## Instructions

1. **Analyze the input carefully**: Read and understand the full context - identify key elements, themes, and characteristics

2. **Consider each class**: For each possible class, evaluate how similar the input is to its typical characteristics

3. **Provide your reasoning in the scratchpad**: Think through which class fits best and why

4. **Provide output**: Give your final classification

If needed, use the scratchpad field to work through steps 1-3, then provide your final answer in the classification field.

## Guidelines

- Select exactly ONE class, even if multiple seem applicable (choose the best match)
- If the input is ambiguous, choose the closest fit and explain your reasoning
- Base your decision on the actual content, not assumptions or implications
- Similar inputs should receive the same classification

Respond using the structured format with scratchpad and classification fields."""

        class ClassificationOutput(BaseModel):
            # Since we're using structured outputs, we want to give the model some
            # space to reason and think as needed
            scratchpad: str
            classification: str

        # stay_attached=False: submit without attaching, then block explicitly below.
        job_id = self.infer(
            data,
            model,
            name,
            description,
            system_prompt=system_prompt,
            output_schema=ClassificationOutput,
            column=column,
            output_column=output_column,
            job_priority=job_priority,
            truncate_rows=truncate_rows,
            stay_attached=False,
        )

        results = self.await_job_completion(job_id)

        # Filter out scratchpad if not wanted: each cell of `output_column` is parsed
        # as JSON and replaced by its "classification" value.
        # NOTE(review): assumes `results` is a Polars DataFrame here - confirm against
        # `await_job_completion`.
        if not include_scratchpad:
            results = results.with_columns(
                pl.col(output_column)
                .map_elements(
                    lambda cell: json.loads(cell)["classification"],
                    return_dtype=pl.Utf8,
                )
                .alias(output_column)
            )

        return results
|
sutro/templates/embed.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from typing import Union, List
|
|
2
|
+
import polars as pl
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from ..common import EmbeddingModelOptions
|
|
5
|
+
from ..interfaces import BaseSutroClient
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class EmbeddingTemplates(BaseSutroClient):
    """Embedding template built on ``self.infer`` and ``self.await_job_completion``."""

    def embed(
        self,
        data: Union[List, pd.DataFrame, pl.DataFrame, str],
        model: EmbeddingModelOptions = "qwen-3-embedding-0.6b",
        job_priority: int = 0,
        name: Union[str, List[str]] = None,
        description: Union[str, List[str]] = None,
        output_column: str = "inference_result",
        column: Union[str, List[str]] = None,
        truncate_rows: bool = True,
    ):
        """
        A simple template style function to generate embeddings for the provided data, with Sutro.
        The intention is that the implemented code should be very easy to extend further, while
        showing a basic structure for large scale embedding generation with Sutro.

        This method allows you to generate vector embeddings for the provided data using Sutro.
        It supports various options for inputting data, such as lists, DataFrames (Polars or
        Pandas), file paths and datasets.
        The method will wait for the embedding job to complete before returning the results.

        Args:
            data (Union[List, pd.DataFrame, pl.DataFrame, str]): The data to generate embeddings
                for.
            model (EmbeddingModelOptions, optional): The embedding model to use. Defaults to
                "qwen-3-embedding-0.6b"; a model we chose as it's small & fast, yet performs well
                on a variety of tasks.
            job_priority (int, optional): The priority of the job. Defaults to 0.
            name (Union[str, List[str]], optional): A job name for experiment/metadata tracking
                purposes. Defaults to None.
            description (Union[str, List[str]], optional): A job description for
                experiment/metadata tracking purposes. Defaults to None.
            output_column (str, optional): The column name to store the embedding results in if
                the input is a DataFrame. Defaults to "inference_result".
            column (Union[str, List[str]], optional): The column name to use for embedding
                generation. Required if data is a DataFrame, file path, or dataset. If a list is
                supplied, it will concatenate the columns of the list into a single column,
                accepting separator strings.
            truncate_rows (bool, optional): If True, any rows that have a token count exceeding
                the context window length of the selected model will be truncated to the max
                length that will fit within the context window. Defaults to True.

        Returns:
            The completed embedding results for the provided data.
        """
        # Options are passed by keyword so the call does not depend on the positional
        # order of `infer`'s parameters.
        # stay_attached=False: submit without attaching, then block explicitly below.
        job_id = self.infer(
            data,
            model,
            name,
            description,
            column=column,
            output_column=output_column,
            job_priority=job_priority,
            truncate_rows=truncate_rows,
            stay_attached=False,
        )

        return self.await_job_completion(job_id)
|
sutro/validation.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import importlib.metadata
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
|
|
7
|
+
from sutro.common import to_colored_text
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def check_version(package_name: str):
    """Print an upgrade hint when the installed package is behind PyPI.

    The check is best-effort: network errors, unexpected PyPI payloads and any
    other failure while checking are swallowed so that it can never block
    normal usage.

    Args:
        package_name: Distribution name to look up, both locally and on PyPI.

    Returns:
        None. Output is written to stdout only.
    """
    try:
        # Local version
        local_version = importlib.metadata.version(package_name)
    except importlib.metadata.PackageNotFoundError:
        print(f"{package_name} is not installed.")
        return

    try:
        # Latest release from PyPI
        resp = requests.get(f"https://pypi.org/pypi/{package_name}/json", timeout=2)
        resp.raise_for_status()
        latest_version = resp.json()["info"]["version"]

        # Only nag when the local install is not ahead of PyPI; a plain `!=`
        # would tell users of a newer dev build to "upgrade" to an older release.
        if _is_outdated(local_version, latest_version):
            msg = (
                f"⚠️ You are using {package_name} {local_version}, "
                f"but the latest release is {latest_version}. "
                f"Run `[uv] pip install -U {package_name}` to upgrade."
            )
            print(to_colored_text(msg, state="callout"))
    except Exception:
        # Deliberately broad and silent: the version hint is a convenience and
        # must never prevent the package from being used.
        pass


def _release_tuple(version: str) -> tuple:
    """Return the leading numeric release segment of *version* as a tuple of ints.

    Parsing stops at the first component that is not purely ASCII digits, e.g.
    "0.1.38rc1" -> (0, 1, 38). Returns an empty tuple if nothing can be parsed.
    """
    release = []
    for piece in version.split("."):
        digits = ""
        for ch in piece:
            if ch not in "0123456789":
                break
            digits += ch
        if not digits:
            break
        release.append(int(digits))
        if len(digits) != len(piece):
            # A suffix such as "rc1" or "+local" ends the release segment.
            break
    return tuple(release)


def _is_outdated(local_version: str, latest_version: str) -> bool:
    """Return True when *local_version* should be reported as behind *latest_version*."""
    if local_version == latest_version:
        return False

    local_release = _release_tuple(local_version)
    latest_release = _release_tuple(latest_version)
    if not local_release or not latest_release:
        # Unparseable version string: fall back to plain inequality.
        return True

    # Pad so that "0.1" and "0.1.0" compare equal.
    width = max(len(local_release), len(latest_release))
    local_release += (0,) * (width - len(local_release))
    latest_release += (0,) * (width - len(latest_release))

    # Equal release numbers with different strings (pre/post/dev suffixes) are
    # still reported; only a strictly newer local release is suppressed.
    return local_release <= latest_release
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def check_for_api_key():
    """
    Check for an API key in the user's home directory.

    This function looks for a configuration file named 'config.json' in the
    '.sutro' directory within the user's home directory.
    If the file exists, it attempts to read the API key from it.

    Returns:
        str or None: The API key if found in the configuration file, or None if
        the file is missing, cannot be read, is not valid JSON, is not a JSON
        object, or has no "api_key" entry.

    Note:
        The expected structure of the config.json file is:
        {
            "api_key": "your_api_key_here"
        }
    """
    config_dir = os.path.expanduser("~/.sutro")
    config_file = os.path.join(config_dir, "config.json")

    # Open directly rather than checking existence first, so a file removed
    # between the check and the open cannot raise.
    try:
        with open(config_file, "r", encoding="utf-8") as f:
            config = json.load(f)
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: malformed JSON or bad
        # encoding (JSONDecodeError and UnicodeDecodeError both derive from it).
        return None

    # Valid JSON that is not an object (e.g. a list) cannot hold an "api_key".
    if not isinstance(config, dict):
        return None

    return config.get("api_key")
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
sutro/__init__.py,sha256=yUiVwcZ8QamSqDdRHgzoANyTZ-x3cPzlt2Fs5OllR_w,402
|
|
2
|
+
sutro/cli.py,sha256=0NuqRInXA-_7TRw-T0OxP8otmUHUarMtY7kuLbWDous,13751
|
|
3
|
+
sutro/common.py,sha256=FuTYTzy82Ul56r9SVH0XMOqcBBspDAFvrtHM93ZbT_8,6945
|
|
4
|
+
sutro/interfaces.py,sha256=j8k4iEHjHu6HDEb9XqiuJrIRbXhLZi9WsiVmxC97R8s,2972
|
|
5
|
+
sutro/sdk.py,sha256=qhpmQNHZDaeGffPyCopmlc6YQuA1_hLFmuHeQXlNbSM,56107
|
|
6
|
+
sutro/templates/classification.py,sha256=iNFiyuR8bZc9Xe-NdimklpQUveqg3p_eJOlEAHaj7Is,6080
|
|
7
|
+
sutro/templates/embed.py,sha256=csvLA0hw5Qaro_yZvALRRp9_SbfWABFN0iQXrf8E8_I,2941
|
|
8
|
+
sutro/validation.py,sha256=FlFH5e5PAPIPpCrzU7mwfZKDDvrmkHt2yYsFm0Ahfmg,1849
|
|
9
|
+
sutro-0.1.38.dist-info/WHEEL,sha256=X16MKk8bp2DRsAuyteHJ-9qOjzmnY0x1aj0P1ftqqWA,78
|
|
10
|
+
sutro-0.1.38.dist-info/entry_points.txt,sha256=s-dtPZ0AScjvR8S_ykhzXxtVcUjrRlxVxyJymI81A3E,41
|
|
11
|
+
sutro-0.1.38.dist-info/METADATA,sha256=VeFTzSqKUiPkx8Ey-g1DkZ-_EkrLt6BwVVQa7-XC-sw,6259
|
|
12
|
+
sutro-0.1.38.dist-info/RECORD,,
|
sutro-0.1.37.dist-info/RECORD
DELETED
|
@@ -1,7 +0,0 @@
|
|
|
1
|
-
sutro/__init__.py,sha256=yUiVwcZ8QamSqDdRHgzoANyTZ-x3cPzlt2Fs5OllR_w,402
|
|
2
|
-
sutro/cli.py,sha256=_FU8PwP4dMzXXg5ldxCXP3kaZvQtOKdA8Kzjc34xmQ0,13727
|
|
3
|
-
sutro/sdk.py,sha256=dysuW6jwtuMjVTdDH1zCoycWLvjzBZa_Mi6dSM_zWpY,63799
|
|
4
|
-
sutro-0.1.37.dist-info/WHEEL,sha256=X16MKk8bp2DRsAuyteHJ-9qOjzmnY0x1aj0P1ftqqWA,78
|
|
5
|
-
sutro-0.1.37.dist-info/entry_points.txt,sha256=s-dtPZ0AScjvR8S_ykhzXxtVcUjrRlxVxyJymI81A3E,41
|
|
6
|
-
sutro-0.1.37.dist-info/METADATA,sha256=pOSPs0yhCpKEhHZJIPIaL-wxSXYoUVBTLQNqN7WjO3E,6259
|
|
7
|
-
sutro-0.1.37.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|