themefinder-0.6.2-py3-none-any.whl → themefinder-0.7.0-py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

This version of themefinder has been flagged as potentially problematic.

@@ -1,17 +1,16 @@
 import asyncio
-import json
 import logging
 import os
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Optional, Type
+from typing import Any, Optional
 
 import openai
 import pandas as pd
 import tiktoken
 from langchain_core.prompts import PromptTemplate
 from langchain_core.runnables import Runnable
-from pydantic import BaseModel, ValidationError
+from pydantic import ValidationError
 from tenacity import (
     before,
     before_sleep_log,
@@ -20,7 +19,7 @@ from tenacity import (
     wait_random_exponential,
 )
 
-from .themefinder_logging import logger
+from themefinder.themefinder_logging import logger
 
 
 @dataclass
@@ -35,8 +34,8 @@ async def batch_and_run(
     llm: Runnable,
     batch_size: int = 10,
     partition_key: str | None = None,
-    validation_check: bool = False,
-    task_validation_model: Type[BaseModel] = None,
+    integrity_check: bool = False,
+    concurrency: int = 10,
     **kwargs: Any,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Process a DataFrame of responses in batches using an LLM.
@@ -51,11 +50,11 @@ async def batch_and_run(
             Defaults to 10.
         partition_key (str | None, optional): Optional column name to group input rows
             before batching. Defaults to None.
-        validation_check (bool, optional): If True, verifies that all input
-            response IDs are present in LLM output and validates the rows against the validation model,
-            failed rows are retried individually.
+        integrity_check (bool, optional): If True, verifies that all input
+            response IDs are present in LLM output.
             If False, no integrity checking or retrying occurs. Defaults to False.
-        task_validation_model (Type[BaseModel]): the pydanctic model to validate each row against
+        concurrency (int, optional): Maximum number of simultaneous LLM calls allowed.
+            Defaults to 10.
         **kwargs (Any): Additional keyword arguments to pass to the prompt template.
 
     Returns:
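
The integrity check documented here compares the response IDs sent in a batch against the IDs the model echoes back. The helper that does this, get_missing_response_ids, is defined elsewhere in the module and is not shown in this diff, so the body below is an assumption about its behavior, not the package's code:

```python
# Hedged sketch only: the real get_missing_response_ids lives outside this diff.
def get_missing_response_ids(sent_ids: list[int], all_results: dict) -> list[int]:
    returned = {row["response_id"] for row in all_results["responses"]}
    return [rid for rid in sent_ids if rid not in returned]

# e.g. IDs [1, 2, 3] were sent but only 1 and 3 came back -> [2] gets retried
assert get_missing_response_ids(
    [1, 2, 3], {"responses": [{"response_id": 1}, {"response_id": 3}]}
) == [2]
```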
@@ -80,8 +79,8 @@ async def batch_and_run(
     processed_rows, failed_ids = await call_llm(
         batch_prompts=batch_prompts,
         llm=llm,
-        validation_check=validation_check,
-        task_validation_model=task_validation_model,
+        integrity_check=integrity_check,
+        concurrency=concurrency,
     )
     processed_results = process_llm_responses(processed_rows, input_df)
 
@@ -93,8 +92,8 @@ async def batch_and_run(
     retry_results, unprocessable_ids = await call_llm(
         batch_prompts=retry_prompts,
         llm=llm,
-        validation_check=validation_check,
-        task_validation_model=task_validation_model,
+        integrity_check=integrity_check,
+        concurrency=concurrency,
     )
     retry_processed_results = process_llm_responses(retry_results, retry_df)
     unprocessable_df = retry_df.loc[retry_df["response_id"].isin(unprocessable_ids)]
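
These two hunks are the same mechanical rename at both call sites: the first full-batch pass and the retry pass each forward integrity_check and concurrency in place of the removed validation kwargs. The glue between the two passes falls outside this diff; a hedged reconstruction from the context lines (retry_df, retry_prompts, and the one-row-per-prompt split are assumptions):

```python
# Assumed plumbing between the two call_llm call sites shown above; only
# the variable names visible in the context lines come from the source.
retry_df = input_df[input_df["response_id"].isin(failed_ids)]
retry_prompts = [
    build_prompt(prompt_template, retry_df.iloc[[i]], **kwargs)
    for i in range(len(retry_df))  # one row per prompt, so bad rows fail alone
]
retry_results, unprocessable_ids = await call_llm(
    batch_prompts=retry_prompts,
    llm=llm,
    integrity_check=integrity_check,
    concurrency=concurrency,
)
```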
@@ -287,32 +286,9 @@ async def call_llm(
     batch_prompts: list[BatchPrompt],
     llm: Runnable,
     concurrency: int = 10,
-    validation_check: bool = False,
-    task_validation_model: Optional[Type[BaseModel]] = None,
+    integrity_check: bool = False,
 ) -> tuple[list[dict], list[int]]:
-    """Process multiple batches of prompts concurrently through an LLM with retry logic.
-
-    Args:
-        batch_prompts (list[BatchPrompt]): List of BatchPrompt objects, each containing a
-            prompt string and associated response IDs to be processed.
-        llm (Runnable): LangChain Runnable instance that will process the prompts.
-        concurrency (int, optional): Maximum number of simultaneous LLM calls allowed.
-            Defaults to 10.
-        validation_check (bool, optional): If True, verifies that all input
-            response IDs are present in the LLM output. Failed batches are discarded and
-            their IDs are returned for retry. Defaults to False.
-        task_validation_model (Type[BaseModel]): The Pydantic model to check the LLM outputs against
-
-    Returns:
-        tuple[list[dict[str, Any]], set[str]]: A tuple containing:
-            - list of successful LLM responses as dictionaries
-            - set of failed response IDs (empty if no failures or integrity check is False)
-
-    Notes:
-        - Uses exponential backoff retry strategy with up to 6 attempts per batch
-        - Failed batches (when integrity check fails) return None and are filtered out
-        - Concurrency is managed via asyncio.Semaphore to prevent overwhelming the LLM
-    """
+    """Process multiple batches of prompts concurrently through an LLM with retry logic."""
     semaphore = asyncio.Semaphore(concurrency)
 
     @retry(
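
The long docstring deleted here was the only place the retry policy was written down ("exponential backoff ... up to 6 attempts per batch"). The @retry decorator itself survives just below the docstring. A tenacity configuration consistent with the removed Notes would look like the sketch below; the exact arguments are not shown in this diff, so treat them as assumptions:

```python
import logging

from tenacity import (
    before_sleep_log,
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

logger = logging.getLogger("themefinder")

@retry(  # assumed settings matching the removed Notes section
    wait=wait_random_exponential(min=1, max=60),  # exponential backoff with jitter
    stop=stop_after_attempt(6),                   # "up to 6 attempts per batch"
    before_sleep=before_sleep_log(logger, logging.WARNING),
)
async def flaky_call() -> str:
    ...  # the decorated coroutine; in the source this is async_llm_call
```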
@@ -326,24 +302,30 @@ async def call_llm(
         async with semaphore:
             try:
                 llm_response = await llm.ainvoke(batch_prompt.prompt_string)
-                all_results = json.loads(llm_response.content)
-            except (openai.BadRequestError, json.JSONDecodeError) as e:
-                failed_ids = batch_prompt.response_ids
+                all_results = (
+                    llm_response.dict()
+                    if hasattr(llm_response, "dict")
+                    else llm_response
+                )
+                responses = (
+                    all_results["responses"]
+                    if isinstance(all_results, dict)
+                    else all_results.responses
+                )
+            except (openai.BadRequestError, ValueError) as e:
                 logger.warning(e)
-                return [], failed_ids
+                return [], batch_prompt.response_ids
+            except ValidationError as e:
+                logger.warning(e)
+                return [], batch_prompt.response_ids
 
-        if validation_check:
+        if integrity_check:
             failed_ids = get_missing_response_ids(
                 batch_prompt.response_ids, all_results
             )
-            validated_results, invalid_rows = validate_task_data(
-                all_results["responses"], task_validation_model
-            )
-            failed_ids.extend([r["response_id"] for r in invalid_rows])
-            return validated_results, failed_ids
+            return responses, failed_ids
         else:
-            # Flatten the list to align with valid output format
-            return [r for r in all_results["responses"]], []
+            return responses, []
 
     results = await asyncio.gather(
         *[async_llm_call(batch_prompt) for batch_prompt in batch_prompts]
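
This is the substantive change of the release: 0.6.2 parsed llm_response.content as raw JSON, while 0.7.0 assumes the Runnable already returns structured output (for example, a model wrapped with LangChain's with_structured_output, though the diff does not show how the Runnable is built) and duck-types between a plain dict and a Pydantic-style object exposing .dict(), catching ValidationError and ValueError instead of json.JSONDecodeError. A self-contained sketch of the two shapes the new extraction accepts; the ThemeRow/ThemeResponses models are illustrative, not from the package:

```python
from pydantic import BaseModel

class ThemeRow(BaseModel):        # illustrative model, not from themefinder
    response_id: int
    label: str

class ThemeResponses(BaseModel):  # structured-output wrapper with .responses
    responses: list[ThemeRow]

def extract(llm_response):
    """Mirror of the duck-typing added in this hunk."""
    all_results = llm_response.dict() if hasattr(llm_response, "dict") else llm_response
    return all_results["responses"] if isinstance(all_results, dict) else all_results.responses

# Both shapes yield the same list of row dicts:
as_model = ThemeResponses(responses=[ThemeRow(response_id=1, label="cost")])
as_dict = {"responses": [{"response_id": 1, "label": "cost"}]}
assert extract(as_model) == [{"response_id": 1, "label": "cost"}]
assert extract(as_dict) == [{"response_id": 1, "label": "cost"}]
```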
@@ -458,33 +440,3 @@ def build_prompt(
     )
     response_ids = input_batch["response_id"].astype(int).to_list()
     return BatchPrompt(prompt_string=prompt, response_ids=response_ids)
-
-
-def validate_task_data(
-    task_data: pd.DataFrame | list[dict], task_validation_model: Type[BaseModel] = None
-) -> tuple[list[dict], list[dict]]:
-    """
-    Validate each row in task_output against the provided Pydantic model.
-
-    Returns:
-        valid: a list of validated records (dicts).
-        invalid: a list of records (dicts) that failed validation.
-    """
-
-    records = (
-        task_data.to_dict(orient="records")
-        if isinstance(task_data, pd.DataFrame)
-        else task_data
-    )
-
-    if task_validation_model:
-        valid_records, invalid_records = [], []
-        for record in records:
-            try:
-                task_validation_model(**record)
-                valid_records.append(record)
-            except ValidationError as e:
-                invalid_records.append(record)
-                logger.info(f"Failed Validation: {e}")
-        return valid_records, invalid_records
-    return records, []
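
With validate_task_data deleted, 0.7.0 no longer filters rows through a caller-supplied Pydantic model after the fact; schema enforcement moves to the structured-output path shown in the call_llm hunk. Callers who still want per-row validation can apply it themselves to the returned records. A minimal standalone sketch of that pattern, with an illustrative RowModel standing in for the old task_validation_model:

```python
from pydantic import BaseModel, ValidationError

class RowModel(BaseModel):  # illustrative stand-in for the old task_validation_model
    response_id: int
    label: str

def split_valid(records: list[dict]) -> tuple[list[dict], list[dict]]:
    """Roughly what the removed validate_task_data did, outside the library."""
    valid, invalid = [], []
    for record in records:
        try:
            RowModel(**record)
            valid.append(record)
        except ValidationError:
            invalid.append(record)
    return valid, invalid

ok, bad = split_valid([{"response_id": 1, "label": "cost"}, {"response_id": "x"}])
assert len(ok) == 1 and len(bad) == 1
```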