cat-llm 0.0.74__py3-none-any.whl → 0.0.76__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cat-llm
3
- Version: 0.0.74
3
+ Version: 0.0.76
4
4
  Summary: A tool for categorizing text data and images using LLMs and vision models
5
5
  Project-URL: Documentation, https://github.com/chrissoria/cat-llm#readme
6
6
  Project-URL: Issues, https://github.com/chrissoria/cat-llm/issues
@@ -1,19 +1,19 @@
1
1
  catllm/CERAD_functions.py,sha256=q4HbP5e2Yu8NnZZ-2eX4sImyj6u3i8xWcq0pYU81iis,22676
2
- catllm/__about__.py,sha256=E0enlOPQDj7XaMZv62lffULZGOUEAqpRIyZ12A6f3zk,430
2
+ catllm/__about__.py,sha256=yXzP4t-1ifCb-n2qXIRLt4j8v0AsNmsQgXS3fMChAzo,430
3
3
  catllm/__init__.py,sha256=sf02zp7N0NW0mAQi7eQ4gliWR1EwoqvXkHN2HwwjcTE,372
4
4
  catllm/build_web_research.py,sha256=880dfE2bEQb-FrXP-42JoLLtyc9ox_sBULDr38xiTiQ,22655
5
5
  catllm/image_functions.py,sha256=8_FftRU285x1HT-AgNkaobefQVD-5q7ZY_t7JFdL3Sg,36177
6
6
  catllm/model_reference_list.py,sha256=37pWwMcgnf4biE3BVRluH5oz2P6ccdJJiCVNHodBH8k,2307
7
- catllm/text_functions.py,sha256=O6wfDh50Xtc0JvQtjWb9L9PgtBP6cjxWBw-PCNmbiaE,33371
7
+ catllm/text_functions.py,sha256=XF6aGuUyihnCKwGnGyLM1PbFQg3fF6nhJ_PoSX2zLaY,36101
8
8
  catllm/calls/CoVe.py,sha256=Y9OGJbaeJ3Odwira92cPXUlnm_ADFqvpOSFSNjFzMMU,10847
9
9
  catllm/calls/__init__.py,sha256=fWuMwLeSGa6zXJYd4s8IyNblsD62G-1NMUsOKrNIkoI,725
10
- catllm/calls/all_calls.py,sha256=E25KpZ_MakMDeCpNCOOM8kQvlfex6UMjnGN1wHkA4AI,14356
10
+ catllm/calls/all_calls.py,sha256=AeN1QocOvL3Z36lDkq6bO0LB3ruz6pXyedvdci0YCxQ,16627
11
11
  catllm/images/circle.png,sha256=JWujAWAh08-TajAoEr_TAeFNLlfbryOLw6cgIBREBuQ,86202
12
12
  catllm/images/cube.png,sha256=nFec3e5bmRe4zrBCJ8QK-HcJLrG7u7dYdKhmdMfacfE,77275
13
13
  catllm/images/diamond.png,sha256=rJDZKtsnBGRO8FPA0iHuA8FvHFGi9PkI_DWSFdw6iv0,99568
14
14
  catllm/images/overlapping_pentagons.png,sha256=VO5plI6eoVRnjfqinn1nNzsCP2WQhuQy71V0EASouW4,71208
15
15
  catllm/images/rectangles.png,sha256=2XM16HO9EYWj2yHgN4bPXaCwPfl7iYQy0tQUGaJX9xg,40692
16
- cat_llm-0.0.74.dist-info/METADATA,sha256=DYaL_OFgi9MuFpWLd1DHgqVi_osTwK1DJH-E5Q2kaa8,23214
17
- cat_llm-0.0.74.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
18
- cat_llm-0.0.74.dist-info/licenses/LICENSE,sha256=Vje2sS5WV4TnIwY5uQHrF4qnBAM3YOk1pGpdH0ot-2o,34969
19
- cat_llm-0.0.74.dist-info/RECORD,,
16
+ cat_llm-0.0.76.dist-info/METADATA,sha256=EGlOhrerEtwgdk98DPhSCSshmOKnhXHw67-25V8wrJs,23214
17
+ cat_llm-0.0.76.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
18
+ cat_llm-0.0.76.dist-info/licenses/LICENSE,sha256=Vje2sS5WV4TnIwY5uQHrF4qnBAM3YOk1pGpdH0ot-2o,34969
19
+ cat_llm-0.0.76.dist-info/RECORD,,
catllm/__about__.py CHANGED
@@ -1,7 +1,7 @@
1
1
  # SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
2
2
  #
3
3
  # SPDX-License-Identifier: MIT
4
- __version__ = "0.0.74"
4
+ __version__ = "0.0.76"
5
5
  __author__ = "Chris Soria"
6
6
  __email__ = "chrissoria@berkeley.edu"
7
7
  __title__ = "cat-llm"
catllm/calls/all_calls.py CHANGED
@@ -431,3 +431,77 @@ def chain_of_verification_mistral(
431
431
  print(f"ERROR in Chain of Verification: {str(e)}")
432
432
  print("Falling back to initial response.\n")
433
433
  return initial_reply
434
+
435
+ # openai explore corpus call
436
def get_openai_top_n(
    prompt,
    user_model,
    specificity,
    model_source,
    api_key,
    research_question,
    creativity
):
    """
    Send a category-extraction prompt through the OpenAI client and return the reply text.

    Args:
        prompt: Text sent as the user message.
        user_model: Model identifier passed to the chat completions call.
        specificity: Word interpolated into the system message to describe
            how specific the categories should be.
        model_source: Provider name; "perplexity" and "huggingface" select
            an alternative base URL, anything else passes ``base_url=None``.
        api_key: API key handed to the client.
        research_question: When truthy, a task-specific system message that
            includes it is used; otherwise a generic system message is sent.
        creativity: Sampling temperature; omitted from the request when None.

    Returns:
        The message content of the first choice in the response.
    """
    from openai import OpenAI

    # Non-OpenAI providers are reached through the same client by
    # overriding the base URL.
    if model_source == "perplexity":
        endpoint = "https://api.perplexity.ai"
    elif model_source == "huggingface":
        endpoint = "https://router.huggingface.co/v1"
    else:
        endpoint = None

    # Pick the system message: task-specific only when a research question
    # was supplied.
    if research_question:
        system_text = (
            "You are a helpful assistant that extracts categories from survey responses. "
            f"The specific task is to identify {specificity} categories of responses to a survey question. "
            f"The research question is: {research_question}"
        )
    else:
        system_text = "You are a helpful assistant."

    # Only send a temperature when the caller provided one.
    sampling_args = {}
    if creativity is not None:
        sampling_args["temperature"] = creativity

    chat_client = OpenAI(api_key=api_key, base_url=endpoint)
    completion = chat_client.chat.completions.create(
        model=user_model,
        messages=[
            {'role': 'system', 'content': system_text},
            {'role': 'user', 'content': prompt},
        ],
        **sampling_args
    )

    return completion.choices[0].message.content
470
+
471
+ # anthropic explore corpus call
472
def get_anthropic_top_n(
    prompt,
    user_model,
    model_source,
    specificity,
    api_key,
    research_question,
    creativity
):
    """
    Send a category-extraction prompt through the Anthropic client and return the reply text.

    Args:
        prompt: Text sent as the user message.
        user_model: Model identifier passed to the messages call.
        model_source: Accepted for signature parity with the other provider
            helpers; not read by this function.
        specificity: Word interpolated into the system prompt to describe
            how specific the categories should be.
        api_key: API key handed to the client.
        research_question: When truthy, a task-specific system prompt that
            includes it is used; otherwise a generic system prompt is sent.
        creativity: Sampling temperature; omitted from the request when None.

    Returns:
        The ``text`` of the first content item in the response.
    """
    import anthropic

    # Task-specific system prompt only when a research question was given.
    system_content = (
        (
            "You are a helpful assistant that extracts categories from survey responses. "
            f"The specific task is to identify {specificity} categories of responses to a survey question. "
            f"The research question is: {research_question}"
        )
        if research_question
        else "You are a helpful assistant."
    )

    request_args = {
        "model": user_model,
        "max_tokens": 4096,
        "system": system_content,
        "messages": [{'role': 'user', 'content': prompt}],
    }
    # Only send a temperature when the caller provided one.
    if creativity is not None:
        request_args["temperature"] = creativity

    api_client = anthropic.Anthropic(api_key=api_key)
    message = api_client.messages.create(**request_args)

    return message.content[0].text
506
+
507
+
catllm/text_functions.py CHANGED
@@ -6,7 +6,9 @@ from .calls.all_calls import (
6
6
  chain_of_verification_openai,
7
7
  chain_of_verification_google,
8
8
  chain_of_verification_anthropic,
9
- chain_of_verification_mistral
9
+ chain_of_verification_mistral,
10
+ get_openai_top_n,
11
+ get_anthropic_top_n,
10
12
  )
11
13
 
12
14
 
@@ -117,9 +119,9 @@ def explore_common_categories(
117
119
  survey_question,
118
120
  survey_input,
119
121
  api_key,
120
- top_n=10,
122
+ top_n=12,
121
123
  cat_num=10,
122
- divisions=5,
124
+ divisions=10,
123
125
  user_model="gpt-5",
124
126
  creativity=None,
125
127
  specificity="broad",
@@ -164,20 +166,19 @@ Responses are contained within triple backticks here: ```{survey_participant_chu
164
166
  Number your categories from 1 through {cat_num} and be concise with the category labels and provide no description of the categories."""
165
167
 
166
168
  if model_source == "openai":
167
- client = OpenAI(api_key=api_key)
168
169
  try:
169
- response_obj = client.chat.completions.create(
170
- model=user_model,
171
- messages=[
172
- {'role': 'system', 'content': f"""You are a helpful assistant that extracts categories from survey responses. \
173
- The specific task is to identify {specificity} categories of responses to a survey question. \
174
- The research question is: {research_question}""" if research_question else "You are a helpful assistant."},
175
- {'role': 'user', 'content': prompt}
176
- ],
177
- **({"temperature": creativity} if creativity is not None else {})
170
+ reply = get_openai_top_n(
171
+ prompt=prompt,
172
+ user_model=user_model,
173
+ specificity=specificity,
174
+ api_key=api_key,
175
+ model_source=model_source,
176
+ research_question=research_question,
177
+ creativity=creativity
178
178
  )
179
- reply = response_obj.choices[0].message.content
179
+
180
180
  responses.append(reply)
181
+
181
182
  except BadRequestError as e:
182
183
  if "context_length_exceeded" in str(e) or "maximum context length" in str(e):
183
184
  error_msg = (f"Token limit exceeded for model {user_model}. "
@@ -187,6 +188,20 @@ Number your categories from 1 through {cat_num} and be concise with the category
187
188
  print(f"OpenAI API error: {e}")
188
189
  except Exception as e:
189
190
  print(f"An error occurred: {e}")
191
+
192
+ elif model_source == "anthropic":
193
+
194
+ reply = get_anthropic_top_n(
195
+ prompt=prompt,
196
+ user_model=user_model,
197
+ specificity=specificity,
198
+ model_source=model_source,
199
+ api_key=api_key,
200
+ research_question=research_question,
201
+ creativity=creativity
202
+ )
203
+
204
+ responses.append(reply)
190
205
  else:
191
206
  raise ValueError(f"Unsupported model_source: {model_source}")
192
207
 
@@ -204,24 +219,87 @@ Number your categories from 1 through {cat_num} and be concise with the category
204
219
  flat_list = [item.lower() for sublist in responses_list for item in sublist]
205
220
 
206
221
  #convert flat_list to a df
222
+ def normalize_category(cat):
223
+ if pd.isna(cat):
224
+ return cat
225
+ terms = sorted([term.strip().lower() for term in str(cat).split('/')])
226
+ return '/'.join(terms)
227
+
228
+ # normalized column
207
229
  df = pd.DataFrame(flat_list, columns=['Category'])
208
- counts = pd.Series(flat_list).value_counts() # Use original list before conversion
209
- df['counts'] = df['Category'].map(counts)
210
- df = df.sort_values(by='counts', ascending=False).reset_index(drop=True)
211
- df = df.drop_duplicates(subset='Category', keep='first').reset_index(drop=True)
230
+ df['normalized'] = df['Category'].apply(normalize_category)
231
+
232
+ # group by normalized, count, and keep most frequent original
233
+ result = (df.groupby('normalized')
234
+ .agg(Category=('Category', lambda x: x.value_counts().index[0]),
235
+ counts=('Category', 'size'))
236
+ .sort_values('counts', ascending=False)
237
+ .reset_index(drop=True))
238
+
239
+ df = result
240
+
241
+ second_prompt = f"""You are a data analyst reviewing categorized survey data.
242
+
243
+ Task: From the provided categories, identify and return the top {top_n} CONCEPTUALLY UNIQUE categories.
244
+
245
+ Critical Instructions:
246
+ 1. The categories have already been deduplicated for exact string matches
247
+ 2. However, some categories may still be SEMANTICALLY DUPLICATES (same concept, different wording):
248
+ - "closer to work" and "commute/proximity to work" mean the same thing
249
+ - "breakup/household conflict" and "relationship problems" mean the same thing
250
+ 3. When you identify semantic duplicates:
251
+ - Combine their frequencies mentally
252
+ - Keep the version that appears most frequently OR is most clearly worded
253
+ - Each concept should appear ONLY ONCE in your final list
254
+ 4. Keep category names {specificity}
255
+ 5. Return ONLY a numbered list of {top_n} conceptually unique categories
256
+ 6. No additional text, explanations, or commentary
257
+
258
+ Pre-processed Categories (sorted by frequency):
259
+ {df['Category'].head(top_n * 3).tolist()}
260
+
261
+ Note: More categories than needed are provided so you can identify and merge semantic duplicates.
262
+
263
+ Output Format:
264
+ 1. category name
265
+ 2. category name
266
+ 3. category name
267
+
268
+ Top {top_n} Conceptually Unique Categories:"""
212
269
 
213
- second_prompt = f"""From this list of categories, extract the top {top_n} most common categories. \
214
- The categories are contained within triple backticks here: ```{df['Category'].tolist()}``` \
215
- Return the top {top_n} categories as a numbered list sorted from the most to least common and keep the categories {specificity}, with no additional text or explanation."""
216
270
 
217
271
  if model_source == "openai":
218
- client = OpenAI(api_key=api_key)
272
+
273
+ base_url = (
274
+ "https://api.perplexity.ai" if model_source == "perplexity"
275
+ else "https://router.huggingface.co/v1" if model_source == "huggingface"
276
+ else None
277
+ )
278
+
279
+ client = OpenAI(api_key=api_key, base_url=base_url)
280
+
219
281
  response_obj = client.chat.completions.create(
220
282
  model=user_model,
221
283
  messages=[{'role': 'user', 'content': second_prompt}],
222
- temperature=creativity
284
+ **({"temperature": creativity} if creativity is not None else {})
223
285
  )
224
- top_categories = response_obj.choices[0].message.content
286
+
287
+ top_categories = response_obj.choices[0].message.content
288
+
289
+ elif model_source == "anthropic":
290
+ import anthropic
291
+
292
+ client = anthropic.Anthropic(api_key=api_key)
293
+
294
+ response_obj = client.messages.create(
295
+ model=user_model,
296
+ max_tokens=4096,
297
+ messages=[{'role': 'user', 'content': second_prompt}],
298
+ **({"temperature": creativity} if creativity is not None else {})
299
+ )
300
+
301
+ top_categories = response_obj.content[0].text
302
+
225
303
  print(top_categories)
226
304
 
227
305
  top_categories_final = []
@@ -263,6 +341,10 @@ def multi_class(
263
341
  chain_of_thought = True,
264
342
  step_back_prompt = False,
265
343
  context_prompt = False,
344
+ top_n = 12,
345
+ cat_num = 10,
346
+ divisions = 10,
347
+ research_question = None,
266
348
  filename = "categorized_data.csv",
267
349
  save_directory = None,
268
350
  model_source = "auto"
@@ -273,6 +355,7 @@ def multi_class(
273
355
  import regex
274
356
  from tqdm import tqdm
275
357
 
358
+ #used in chain of verification
276
359
  def remove_numbering(line):
277
360
  line = line.strip()
278
361
 
@@ -321,16 +404,33 @@ def multi_class(
321
404
  raise ValueError(f"❌ Could not auto-detect model source from '{user_model}'. Please specify model_source explicitly: OpenAI, Anthropic, Perplexity, Google, Huggingface, or Mistral")
322
405
  else:
323
406
  model_source = model_source.lower()
324
-
407
+
408
+ if categories == "auto":
409
+ if survey_question == "": # step back requires the survey question to function well
410
+ raise TypeError("survey_question is required when using step_back_prompt. Please provide the survey question you are analyzing.")
411
+
412
+ categories = explore_common_categories(
413
+ survey_question=survey_question,
414
+ survey_input=survey_input,
415
+ research_question=research_question,
416
+ api_key=api_key,
417
+ top_n=top_n,
418
+ cat_num=cat_num,
419
+ divisions=divisions
420
+ )
421
+
325
422
  categories_str = "\n".join(f"{i + 1}. {cat}" for i, cat in enumerate(categories))
326
423
  cat_num = len(categories)
327
424
  category_dict = {str(i+1): "0" for i in range(cat_num)}
328
425
  example_JSON = json.dumps(category_dict, indent=4)
329
426
 
330
- # ensure number of categories is what user wants
331
427
  print(f"\nThe categories you entered to be coded by {model_source} {user_model}:")
332
- for i, cat in enumerate(categories, 1):
333
- print(f"{i}. {cat}")
428
+
429
+ if categories != "auto":
430
+ # ensure number of categories is what user wants
431
+
432
+ for i, cat in enumerate(categories, 1):
433
+ print(f"{i}. {cat}")
334
434
 
335
435
  link1 = []
336
436
  extracted_jsons = []