themefinder 0.6.2__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of themefinder might be problematic.
- themefinder/__init__.py +8 -2
- themefinder/core.py +217 -39
- themefinder/llm_batch_processor.py +33 -81
- themefinder/models.py +371 -94
- themefinder/prompts/agentic_theme_clustering.txt +31 -0
- themefinder/prompts/detail_detection.txt +19 -0
- themefinder/prompts/sentiment_analysis.txt +0 -14
- themefinder/prompts/theme_condensation.txt +2 -22
- themefinder/prompts/theme_generation.txt +6 -38
- themefinder/prompts/theme_mapping.txt +6 -23
- themefinder/prompts/theme_refinement.txt +7 -16
- themefinder/prompts/theme_target_alignment.txt +2 -10
- themefinder/theme_clustering_agent.py +332 -0
- {themefinder-0.6.2.dist-info → themefinder-0.7.0.dist-info}/METADATA +24 -9
- themefinder-0.7.0.dist-info/RECORD +19 -0
- {themefinder-0.6.2.dist-info → themefinder-0.7.0.dist-info}/WHEEL +1 -1
- themefinder-0.6.2.dist-info/RECORD +0 -16
- {themefinder-0.6.2.dist-info → themefinder-0.7.0.dist-info}/LICENCE +0 -0
themefinder/llm_batch_processor.py

@@ -1,17 +1,16 @@
 import asyncio
-import json
 import logging
 import os
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, Optional
 
 import openai
 import pandas as pd
 import tiktoken
 from langchain_core.prompts import PromptTemplate
 from langchain_core.runnables import Runnable
-from pydantic import
+from pydantic import ValidationError
 from tenacity import (
     before,
     before_sleep_log,
@@ -20,7 +19,7 @@ from tenacity import (
     wait_random_exponential,
 )
 
-from .themefinder_logging import logger
+from themefinder.themefinder_logging import logger
 
 
 @dataclass
@@ -35,8 +34,8 @@ async def batch_and_run(
     llm: Runnable,
     batch_size: int = 10,
     partition_key: str | None = None,
-
-
+    integrity_check: bool = False,
+    concurrency: int = 10,
     **kwargs: Any,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Process a DataFrame of responses in batches using an LLM.
@@ -51,11 +50,11 @@ async def batch_and_run(
             Defaults to 10.
         partition_key (str | None, optional): Optional column name to group input rows
             before batching. Defaults to None.
-
-            response IDs are present in LLM output
-            failed rows are retried individually.
+        integrity_check (bool, optional): If True, verifies that all input
+            response IDs are present in LLM output.
             If False, no integrity checking or retrying occurs. Defaults to False.
-
+        concurrency (int, optional): Maximum number of simultaneous LLM calls allowed.
+            Defaults to 10.
         **kwargs (Any): Additional keyword arguments to pass to the prompt template.
 
     Returns:
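A minimal caller sketch for the new keyword arguments documented above. The positional arguments before llm are not shown in this hunk, so the (responses_df, prompt_template, llm) ordering, the column names, and the prompt-template value are assumptions for illustration only:

import pandas as pd

from themefinder.llm_batch_processor import batch_and_run

# Hypothetical input: a tiny responses DataFrame (column names assumed).
responses_df = pd.DataFrame(
    {"response_id": [1, 2, 3], "response": ["Too slow", "Loved it", "Form was unclear"]}
)


async def run(llm):
    # integrity_check=True retries rows whose response_id is missing from the
    # LLM output; concurrency caps simultaneous LLM calls (defaults per the docstring).
    processed_df, failed_df = await batch_and_run(
        responses_df,
        "theme_generation",  # assumed prompt-template argument
        llm,
        batch_size=10,
        integrity_check=True,
        concurrency=5,
    )
    return processed_df, failed_df

# asyncio.run(run(llm)) with any LangChain Runnable chat model.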
@@ -80,8 +79,8 @@ async def batch_and_run(
     processed_rows, failed_ids = await call_llm(
         batch_prompts=batch_prompts,
         llm=llm,
-
-
+        integrity_check=integrity_check,
+        concurrency=concurrency,
     )
     processed_results = process_llm_responses(processed_rows, input_df)
 
@@ -93,8 +92,8 @@ async def batch_and_run(
     retry_results, unprocessable_ids = await call_llm(
         batch_prompts=retry_prompts,
         llm=llm,
-
-
+        integrity_check=integrity_check,
+        concurrency=concurrency,
     )
     retry_processed_results = process_llm_responses(retry_results, retry_df)
     unprocessable_df = retry_df.loc[retry_df["response_id"].isin(unprocessable_ids)]
@@ -287,32 +286,9 @@ async def call_llm(
     batch_prompts: list[BatchPrompt],
     llm: Runnable,
     concurrency: int = 10,
-
-    task_validation_model: Optional[Type[BaseModel]] = None,
+    integrity_check: bool = False,
 ) -> tuple[list[dict], list[int]]:
-    """Process multiple batches of prompts concurrently through an LLM with retry logic.
-
-    Args:
-        batch_prompts (list[BatchPrompt]): List of BatchPrompt objects, each containing a
-            prompt string and associated response IDs to be processed.
-        llm (Runnable): LangChain Runnable instance that will process the prompts.
-        concurrency (int, optional): Maximum number of simultaneous LLM calls allowed.
-            Defaults to 10.
-        validation_check (bool, optional): If True, verifies that all input
-            response IDs are present in the LLM output. Failed batches are discarded and
-            their IDs are returned for retry. Defaults to False.
-        task_validation_model (Type[BaseModel]): The Pydantic model to check the LLM outputs against
-
-    Returns:
-        tuple[list[dict[str, Any]], set[str]]: A tuple containing:
-            - list of successful LLM responses as dictionaries
-            - set of failed response IDs (empty if no failures or integrity check is False)
-
-    Notes:
-        - Uses exponential backoff retry strategy with up to 6 attempts per batch
-        - Failed batches (when integrity check fails) return None and are filtered out
-        - Concurrency is managed via asyncio.Semaphore to prevent overwhelming the LLM
-    """
+    """Process multiple batches of prompts concurrently through an LLM with retry logic."""
     semaphore = asyncio.Semaphore(concurrency)
 
     @retry(
@@ -326,24 +302,30 @@ async def call_llm(
         async with semaphore:
             try:
                 llm_response = await llm.ainvoke(batch_prompt.prompt_string)
-                all_results =
-
-
+                all_results = (
+                    llm_response.dict()
+                    if hasattr(llm_response, "dict")
+                    else llm_response
+                )
+                responses = (
+                    all_results["responses"]
+                    if isinstance(all_results, dict)
+                    else all_results.responses
+                )
+            except (openai.BadRequestError, ValueError) as e:
                 logger.warning(e)
-                return [],
+                return [], batch_prompt.response_ids
+            except ValidationError as e:
+                logger.warning(e)
+                return [], batch_prompt.response_ids
 
-            if
+            if integrity_check:
                 failed_ids = get_missing_response_ids(
                     batch_prompt.response_ids, all_results
                 )
-
-                    all_results["responses"], task_validation_model
-                )
-                failed_ids.extend([r["response_id"] for r in invalid_rows])
-                return validated_results, failed_ids
+                return responses, failed_ids
             else:
-
-                return [r for r in all_results["responses"]], []
+                return responses, []
 
     results = await asyncio.gather(
         *[async_llm_call(batch_prompt) for batch_prompt in batch_prompts]
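The rewritten try block above normalises the LLM output before it is used: a structured (Pydantic) response is converted with .dict(), a plain dict is used as-is, and the responses field is then read as a key or as an attribute. A standalone sketch of that pattern, using a toy model whose name is purely illustrative:

from pydantic import BaseModel


class FakeStructuredOutput(BaseModel):
    # Stand-in for a structured LLM output carrying a "responses" field.
    responses: list[dict]


def normalise(llm_response):
    # Mirrors the added logic: prefer .dict() when available, then read
    # "responses" either as a dict key or as an attribute.
    all_results = llm_response.dict() if hasattr(llm_response, "dict") else llm_response
    return all_results["responses"] if isinstance(all_results, dict) else all_results.responses


assert normalise({"responses": [{"response_id": 1}]}) == [{"response_id": 1}]
assert normalise(FakeStructuredOutput(responses=[{"response_id": 1}])) == [{"response_id": 1}]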
@@ -458,33 +440,3 @@ def build_prompt(
     )
     response_ids = input_batch["response_id"].astype(int).to_list()
     return BatchPrompt(prompt_string=prompt, response_ids=response_ids)
-
-
-def validate_task_data(
-    task_data: pd.DataFrame | list[dict], task_validation_model: Type[BaseModel] = None
-) -> tuple[list[dict], list[dict]]:
-    """
-    Validate each row in task_output against the provided Pydantic model.
-
-    Returns:
-        valid: a list of validated records (dicts).
-        invalid: a list of records (dicts) that failed validation.
-    """
-
-    records = (
-        task_data.to_dict(orient="records")
-        if isinstance(task_data, pd.DataFrame)
-        else task_data
-    )
-
-    if task_validation_model:
-        valid_records, invalid_records = [], []
-        for record in records:
-            try:
-                task_validation_model(**record)
-                valid_records.append(record)
-            except ValidationError as e:
-                invalid_records.append(record)
-                logger.info(f"Failed Validation: {e}")
-        return valid_records, invalid_records
-    return records, []
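The per-row validate_task_data helper removed above has no like-for-like replacement: in 0.7.0 a ValidationError raised while handling the LLM output is caught in call_llm and the whole batch's response IDs are reported as failed. A small sketch of the old per-record pattern, with a toy schema whose fields are illustrative only:

from pydantic import BaseModel, ValidationError


class ThemeRow(BaseModel):
    # Toy stand-in for a task's per-row response schema.
    response_id: int
    label: str


records = [
    {"response_id": 1, "label": "delays"},
    {"response_id": "not-an-int", "label": "costs"},
]

# Pre-0.7.0 style: validate each record individually and split valid from invalid.
valid, invalid = [], []
for record in records:
    try:
        ThemeRow(**record)
        valid.append(record)
    except ValidationError:
        invalid.append(record)

print(len(valid), "valid,", len(invalid), "invalid")  # 1 valid, 1 invalid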