cat-llm 0.0.73__tar.gz → 0.0.75__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cat-llm
3
- Version: 0.0.73
3
+ Version: 0.0.75
4
4
  Summary: A tool for categorizing text data and images using LLMs and vision models
5
5
  Project-URL: Documentation, https://github.com/chrissoria/cat-llm#readme
6
6
  Project-URL: Issues, https://github.com/chrissoria/cat-llm/issues
@@ -200,6 +200,7 @@ Processes each text response individually, assigning one or more categories from
200
200
  - `safety` (bool, default=False): Enable safety checks on responses and save results to CSV at each API call step
201
201
  - `to_csv` (bool, default=False): Whether to save results to CSV
202
202
  - `chain_of_verification` (bool, default=False): Enable Chain-of-Verification prompting technique for improved accuracy
203
+ - `chain_of_thought` (bool, default=True): Enable Chain-of-Thought prompting technique for improved accuracy
203
204
  - `step_back_prompt` (bool, default=False): Enable step-back prompting to analyze higher-level context before classification
204
205
  - `context_prompt` (bool, default=False): Add expert role and behavioral guidelines to the prompt
205
206
  - `filename` (str, default="categorized_data.csv"): Filename for CSV output
@@ -171,6 +171,7 @@ Processes each text response individually, assigning one or more categories from
171
171
  - `safety` (bool, default=False): Enable safety checks on responses and save results to CSV at each API call step
172
172
  - `to_csv` (bool, default=False): Whether to save results to CSV
173
173
  - `chain_of_verification` (bool, default=False): Enable Chain-of-Verification prompting technique for improved accuracy
174
+ - `chain_of_thought` (bool, default=True): Enable Chain-of-Thought prompting technique for improved accuracy
174
175
  - `step_back_prompt` (bool, default=False): Enable step-back prompting to analyze higher-level context before classification
175
176
  - `context_prompt` (bool, default=False): Add expert role and behavioral guidelines to the prompt
176
177
  - `filename` (str, default="categorized_data.csv"): Filename for CSV output
@@ -1,7 +1,7 @@
1
1
  # SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
2
2
  #
3
3
  # SPDX-License-Identifier: MIT
4
- __version__ = "0.0.73"
4
+ __version__ = "0.0.75"
5
5
  __author__ = "Chris Soria"
6
6
  __email__ = "chrissoria@berkeley.edu"
7
7
  __title__ = "cat-llm"
@@ -431,3 +431,77 @@ def chain_of_verification_mistral(
431
431
  print(f"ERROR in Chain of Verification: {str(e)}")
432
432
  print("Falling back to initial response.\n")
433
433
  return initial_reply
434
+
435
+ # openai explore corpus call
436
+ def get_openai_top_n(
437
+ prompt,
438
+ user_model,
439
+ specificity,
440
+ model_source,
441
+ api_key,
442
+ research_question,
443
+ creativity
444
+ ):
445
+ """
446
+ Get response from OpenAI API with system message.
447
+ """
448
+ from openai import OpenAI
449
+
450
+ base_url = (
451
+ "https://api.perplexity.ai" if model_source == "perplexity"
452
+ else "https://router.huggingface.co/v1" if model_source == "huggingface"
453
+ else None
454
+ )
455
+
456
+ client = OpenAI(api_key=api_key, base_url=base_url)
457
+
458
+ response_obj = client.chat.completions.create(
459
+ model=user_model,
460
+ messages=[
461
+ {'role': 'system', 'content': f"""You are a helpful assistant that extracts categories from survey responses. \
462
+ The specific task is to identify {specificity} categories of responses to a survey question. \
463
+ The research question is: {research_question}""" if research_question else "You are a helpful assistant."},
464
+ {'role': 'user', 'content': prompt}
465
+ ],
466
+ **({"temperature": creativity} if creativity is not None else {})
467
+ )
468
+
469
+ return response_obj.choices[0].message.content
470
+
471
+ # anthropic explore corpus call
472
+ def get_anthropic_top_n(
473
+ prompt,
474
+ user_model,
475
+ model_source,
476
+ specificity,
477
+ api_key,
478
+ research_question,
479
+ creativity
480
+ ):
481
+ """
482
+ Get response from Anthropic API with system prompt.
483
+ """
484
+ import anthropic
485
+ client = anthropic.Anthropic(api_key=api_key)
486
+
487
+ # build system prompt
488
+ if research_question:
489
+ system_content = (f"You are a helpful assistant that extracts categories from survey responses. "
490
+ f"The specific task is to identify {specificity} categories of responses to a survey question. "
491
+ f"The research question is: {research_question}")
492
+ else:
493
+ system_content = "You are a helpful assistant."
494
+
495
+ response_obj = client.messages.create(
496
+ model=user_model,
497
+ max_tokens=4096,
498
+ system=system_content,
499
+ messages=[
500
+ {'role': 'user', 'content': prompt}
501
+ ],
502
+ **({"temperature": creativity} if creativity is not None else {})
503
+ )
504
+
505
+ return response_obj.content[0].text
506
+
507
+
@@ -6,7 +6,9 @@ from .calls.all_calls import (
6
6
  chain_of_verification_openai,
7
7
  chain_of_verification_google,
8
8
  chain_of_verification_anthropic,
9
- chain_of_verification_mistral
9
+ chain_of_verification_mistral,
10
+ get_openai_top_n,
11
+ get_anthropic_top_n,
10
12
  )
11
13
 
12
14
 
@@ -117,9 +119,9 @@ def explore_common_categories(
117
119
  survey_question,
118
120
  survey_input,
119
121
  api_key,
120
- top_n=10,
122
+ top_n=12,
121
123
  cat_num=10,
122
- divisions=5,
124
+ divisions=10,
123
125
  user_model="gpt-5",
124
126
  creativity=None,
125
127
  specificity="broad",
@@ -164,20 +166,19 @@ Responses are contained within triple backticks here: ```{survey_participant_chu
164
166
  Number your categories from 1 through {cat_num} and be concise with the category labels and provide no description of the categories."""
165
167
 
166
168
  if model_source == "openai":
167
- client = OpenAI(api_key=api_key)
168
169
  try:
169
- response_obj = client.chat.completions.create(
170
- model=user_model,
171
- messages=[
172
- {'role': 'system', 'content': f"""You are a helpful assistant that extracts categories from survey responses. \
173
- The specific task is to identify {specificity} categories of responses to a survey question. \
174
- The research question is: {research_question}""" if research_question else "You are a helpful assistant."},
175
- {'role': 'user', 'content': prompt}
176
- ],
177
- **({"temperature": creativity} if creativity is not None else {})
170
+ reply = get_openai_top_n(
171
+ prompt=prompt,
172
+ user_model=user_model,
173
+ specificity=specificity,
174
+ api_key=api_key,
175
+ model_source=model_source,
176
+ research_question=research_question,
177
+ creativity=creativity
178
178
  )
179
- reply = response_obj.choices[0].message.content
179
+
180
180
  responses.append(reply)
181
+
181
182
  except BadRequestError as e:
182
183
  if "context_length_exceeded" in str(e) or "maximum context length" in str(e):
183
184
  error_msg = (f"Token limit exceeded for model {user_model}. "
@@ -187,6 +188,20 @@ Number your categories from 1 through {cat_num} and be concise with the category
187
188
  print(f"OpenAI API error: {e}")
188
189
  except Exception as e:
189
190
  print(f"An error occurred: {e}")
191
+
192
+ elif model_source == "anthropic":
193
+
194
+ reply = get_anthropic_top_n(
195
+ prompt=prompt,
196
+ user_model=user_model,
197
+ specificity=specificity,
198
+ model_source=model_source,
199
+ api_key=api_key,
200
+ research_question=research_question,
201
+ creativity=creativity
202
+ )
203
+
204
+ responses.append(reply)
190
205
  else:
191
206
  raise ValueError(f"Unsupported model_source: {model_source}")
192
207
 
@@ -204,24 +219,87 @@ Number your categories from 1 through {cat_num} and be concise with the category
204
219
  flat_list = [item.lower() for sublist in responses_list for item in sublist]
205
220
 
206
221
  #convert flat_list to a df
222
+ def normalize_category(cat):
223
+ if pd.isna(cat):
224
+ return cat
225
+ terms = sorted([term.strip().lower() for term in str(cat).split('/')])
226
+ return '/'.join(terms)
227
+
228
+ # normalized column
207
229
  df = pd.DataFrame(flat_list, columns=['Category'])
208
- counts = pd.Series(flat_list).value_counts() # Use original list before conversion
209
- df['counts'] = df['Category'].map(counts)
210
- df = df.sort_values(by='counts', ascending=False).reset_index(drop=True)
211
- df = df.drop_duplicates(subset='Category', keep='first').reset_index(drop=True)
230
+ df['normalized'] = df['Category'].apply(normalize_category)
231
+
232
+ # group by normalized, count, and keep most frequent original
233
+ result = (df.groupby('normalized')
234
+ .agg(Category=('Category', lambda x: x.value_counts().index[0]),
235
+ counts=('Category', 'size'))
236
+ .sort_values('counts', ascending=False)
237
+ .reset_index(drop=True))
238
+
239
+ df = result
240
+
241
+ second_prompt = f"""You are a data analyst reviewing categorized survey data.
242
+
243
+ Task: From the provided categories, identify and return the top {top_n} CONCEPTUALLY UNIQUE categories.
244
+
245
+ Critical Instructions:
246
+ 1. The categories have already been deduplicated for exact string matches
247
+ 2. However, some categories may still be SEMANTICALLY DUPLICATES (same concept, different wording):
248
+ - "closer to work" and "commute/proximity to work" mean the same thing
249
+ - "breakup/household conflict" and "relationship problems" mean the same thing
250
+ 3. When you identify semantic duplicates:
251
+ - Combine their frequencies mentally
252
+ - Keep the version that appears most frequently OR is most clearly worded
253
+ - Each concept should appear ONLY ONCE in your final list
254
+ 4. Keep category names {specificity}
255
+ 5. Return ONLY a numbered list of {top_n} conceptually unique categories
256
+ 6. No additional text, explanations, or commentary
257
+
258
+ Pre-processed Categories (sorted by frequency):
259
+ {df['Category'].head(top_n * 3).tolist()}
260
+
261
+ Note: More categories than needed are provided so you can identify and merge semantic duplicates.
262
+
263
+ Output Format:
264
+ 1. category name
265
+ 2. category name
266
+ 3. category name
267
+
268
+ Top {top_n} Conceptually Unique Categories:"""
212
269
 
213
- second_prompt = f"""From this list of categories, extract the top {top_n} most common categories. \
214
- The categories are contained within triple backticks here: ```{df['Category'].tolist()}``` \
215
- Return the top {top_n} categories as a numbered list sorted from the most to least common and keep the categories {specificity}, with no additional text or explanation."""
216
270
 
217
271
  if model_source == "openai":
218
- client = OpenAI(api_key=api_key)
272
+
273
+ base_url = (
274
+ "https://api.perplexity.ai" if model_source == "perplexity"
275
+ else "https://router.huggingface.co/v1" if model_source == "huggingface"
276
+ else None
277
+ )
278
+
279
+ client = OpenAI(api_key=api_key, base_url=base_url)
280
+
219
281
  response_obj = client.chat.completions.create(
220
282
  model=user_model,
221
283
  messages=[{'role': 'user', 'content': second_prompt}],
222
- temperature=creativity
284
+ **({"temperature": creativity} if creativity is not None else {})
223
285
  )
224
- top_categories = response_obj.choices[0].message.content
286
+
287
+ top_categories = response_obj.choices[0].message.content
288
+
289
+ elif model_source == "anthropic":
290
+ import anthropic
291
+
292
+ client = anthropic.Anthropic(api_key=api_key)
293
+
294
+ response_obj = client.messages.create(
295
+ model=user_model,
296
+ max_tokens=4096,
297
+ messages=[{'role': 'user', 'content': second_prompt}],
298
+ **({"temperature": creativity} if creativity is not None else {})
299
+ )
300
+
301
+ top_categories = response_obj.content[0].text
302
+
225
303
  print(top_categories)
226
304
 
227
305
  top_categories_final = []
@@ -260,6 +338,7 @@ def multi_class(
260
338
  safety = False,
261
339
  to_csv = False,
262
340
  chain_of_verification = False,
341
+ chain_of_thought = True,
263
342
  step_back_prompt = False,
264
343
  context_prompt = False,
265
344
  filename = "categorized_data.csv",
@@ -397,12 +476,27 @@ def multi_class(
397
476
  extracted_jsons.append(default_json)
398
477
  #print(f"Skipped NaN input.")
399
478
  else:
479
+ if chain_of_thought:
480
+ prompt = f"""{survey_question_context}
481
+
482
+ Categorize this survey response "{response}" into the following categories that apply:
483
+ {categories_str}
484
+
485
+ Let's think step by step:
486
+ 1. First, identify the main themes mentioned in the response
487
+ 2. Then, match each theme to the relevant categories
488
+ 3. Finally, assign 1 to matching categories and 0 to non-matching categories
489
+
490
+ {examples_text}
491
+
492
+ Provide your reasoning for each category, then provide your final answer in JSON format where the number belonging to each category is the key and a 1 if the category is present and a 0 if it is not present as key values."""
493
+ else:
400
494
 
401
- prompt = f"""{survey_question_context} \
402
- Categorize this survey response "{response}" into the following categories that apply: \
403
- {categories_str}
404
- {examples_text}
405
- Provide your work in JSON format where the number belonging to each category is the key and a 1 if the category is present and a 0 if it is not present as key values."""
495
+ prompt = f"""{survey_question_context} \
496
+ Categorize this survey response "{response}" into the following categories that apply: \
497
+ {categories_str}
498
+ {examples_text}
499
+ Provide your work in JSON format where the number belonging to each category is the key and a 1 if the category is present and a 0 if it is not present as key values."""
406
500
 
407
501
  if context_prompt:
408
502
  context = """You are an expert researcher in survey data categorization.
File without changes
File without changes
File without changes