cat-llm 0.0.74__py3-none-any.whl → 0.0.76__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cat_llm-0.0.74.dist-info → cat_llm-0.0.76.dist-info}/METADATA +1 -1
- {cat_llm-0.0.74.dist-info → cat_llm-0.0.76.dist-info}/RECORD +7 -7
- catllm/__about__.py +1 -1
- catllm/calls/all_calls.py +74 -0
- catllm/text_functions.py +128 -28
- {cat_llm-0.0.74.dist-info → cat_llm-0.0.76.dist-info}/WHEEL +0 -0
- {cat_llm-0.0.74.dist-info → cat_llm-0.0.76.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: cat-llm
|
3
|
-
Version: 0.0.74
|
3
|
+
Version: 0.0.76
|
4
4
|
Summary: A tool for categorizing text data and images using LLMs and vision models
|
5
5
|
Project-URL: Documentation, https://github.com/chrissoria/cat-llm#readme
|
6
6
|
Project-URL: Issues, https://github.com/chrissoria/cat-llm/issues
|
@@ -1,19 +1,19 @@
|
|
1
1
|
catllm/CERAD_functions.py,sha256=q4HbP5e2Yu8NnZZ-2eX4sImyj6u3i8xWcq0pYU81iis,22676
|
2
|
-
catllm/__about__.py,sha256=
|
2
|
+
catllm/__about__.py,sha256=yXzP4t-1ifCb-n2qXIRLt4j8v0AsNmsQgXS3fMChAzo,430
|
3
3
|
catllm/__init__.py,sha256=sf02zp7N0NW0mAQi7eQ4gliWR1EwoqvXkHN2HwwjcTE,372
|
4
4
|
catllm/build_web_research.py,sha256=880dfE2bEQb-FrXP-42JoLLtyc9ox_sBULDr38xiTiQ,22655
|
5
5
|
catllm/image_functions.py,sha256=8_FftRU285x1HT-AgNkaobefQVD-5q7ZY_t7JFdL3Sg,36177
|
6
6
|
catllm/model_reference_list.py,sha256=37pWwMcgnf4biE3BVRluH5oz2P6ccdJJiCVNHodBH8k,2307
|
7
|
-
catllm/text_functions.py,sha256=
|
7
|
+
catllm/text_functions.py,sha256=XF6aGuUyihnCKwGnGyLM1PbFQg3fF6nhJ_PoSX2zLaY,36101
|
8
8
|
catllm/calls/CoVe.py,sha256=Y9OGJbaeJ3Odwira92cPXUlnm_ADFqvpOSFSNjFzMMU,10847
|
9
9
|
catllm/calls/__init__.py,sha256=fWuMwLeSGa6zXJYd4s8IyNblsD62G-1NMUsOKrNIkoI,725
|
10
|
-
catllm/calls/all_calls.py,sha256=
|
10
|
+
catllm/calls/all_calls.py,sha256=AeN1QocOvL3Z36lDkq6bO0LB3ruz6pXyedvdci0YCxQ,16627
|
11
11
|
catllm/images/circle.png,sha256=JWujAWAh08-TajAoEr_TAeFNLlfbryOLw6cgIBREBuQ,86202
|
12
12
|
catllm/images/cube.png,sha256=nFec3e5bmRe4zrBCJ8QK-HcJLrG7u7dYdKhmdMfacfE,77275
|
13
13
|
catllm/images/diamond.png,sha256=rJDZKtsnBGRO8FPA0iHuA8FvHFGi9PkI_DWSFdw6iv0,99568
|
14
14
|
catllm/images/overlapping_pentagons.png,sha256=VO5plI6eoVRnjfqinn1nNzsCP2WQhuQy71V0EASouW4,71208
|
15
15
|
catllm/images/rectangles.png,sha256=2XM16HO9EYWj2yHgN4bPXaCwPfl7iYQy0tQUGaJX9xg,40692
|
16
|
-
cat_llm-0.0.
|
17
|
-
cat_llm-0.0.
|
18
|
-
cat_llm-0.0.
|
19
|
-
cat_llm-0.0.
|
16
|
+
cat_llm-0.0.76.dist-info/METADATA,sha256=EGlOhrerEtwgdk98DPhSCSshmOKnhXHw67-25V8wrJs,23214
|
17
|
+
cat_llm-0.0.76.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
18
|
+
cat_llm-0.0.76.dist-info/licenses/LICENSE,sha256=Vje2sS5WV4TnIwY5uQHrF4qnBAM3YOk1pGpdH0ot-2o,34969
|
19
|
+
cat_llm-0.0.76.dist-info/RECORD,,
|
catllm/__about__.py
CHANGED
catllm/calls/all_calls.py
CHANGED
@@ -431,3 +431,77 @@ def chain_of_verification_mistral(
|
|
431
431
|
print(f"ERROR in Chain of Verification: {str(e)}")
|
432
432
|
print("Falling back to initial response.\n")
|
433
433
|
return initial_reply
|
434
|
+
|
435
|
+
# openai explore corpus call
|
436
|
+
def get_openai_top_n(
    prompt,
    user_model,
    specificity,
    model_source,
    api_key,
    research_question,
    creativity
):
    """
    Send a category-extraction prompt through the OpenAI client and return the reply text.

    Args:
        prompt: User message content sent to the model.
        user_model: Model identifier passed as ``model``.
        specificity: Word interpolated into the system message to describe
            how fine-grained the categories should be.
        model_source: Provider name. "perplexity" and "huggingface" select a
            provider-specific base URL; any other value passes ``base_url=None``.
        api_key: API key handed to the client.
        research_question: When truthy, a task-specific system message that
            embeds it is used; otherwise a generic system message is sent.
        creativity: Sampling temperature. Omitted from the request when None.

    Returns:
        The ``content`` of the first choice's message.
    """
    from openai import OpenAI

    # Pick the endpoint for providers reached through the OpenAI client.
    if model_source == "perplexity":
        endpoint = "https://api.perplexity.ai"
    elif model_source == "huggingface":
        endpoint = "https://router.huggingface.co/v1"
    else:
        endpoint = None

    chat_client = OpenAI(api_key=api_key, base_url=endpoint)

    # Task-specific system message only when a research question was supplied.
    if research_question:
        system_text = (
            "You are a helpful assistant that extracts categories from survey responses. "
            f"The specific task is to identify {specificity} categories of responses to a survey question. "
            f"The research question is: {research_question}"
        )
    else:
        system_text = "You are a helpful assistant."

    request_kwargs = {
        "model": user_model,
        "messages": [
            {'role': 'system', 'content': system_text},
            {'role': 'user', 'content': prompt},
        ],
    }
    # Leave temperature out entirely unless the caller chose one.
    if creativity is not None:
        request_kwargs["temperature"] = creativity

    completion = chat_client.chat.completions.create(**request_kwargs)
    return completion.choices[0].message.content
|
470
|
+
|
471
|
+
# anthropic explore corpus call
|
472
|
+
def get_anthropic_top_n(
    prompt,
    user_model,
    model_source,
    specificity,
    api_key,
    research_question,
    creativity
):
    """
    Send a category-extraction prompt to the Anthropic Messages API and return the reply text.

    Args:
        prompt: User message content sent to the model.
        user_model: Model identifier passed as ``model``.
        model_source: Accepted as part of the signature; not used in the body.
        specificity: Word interpolated into the system prompt to describe
            how fine-grained the categories should be.
        api_key: API key handed to the client.
        research_question: When truthy, a task-specific system prompt that
            embeds it is used; otherwise a generic system prompt is sent.
        creativity: Sampling temperature. Omitted from the request when None.

    Returns:
        The ``text`` of the first content item in the response.
    """
    import anthropic

    anthropic_client = anthropic.Anthropic(api_key=api_key)

    # Generic prompt by default; task-specific one when a research question exists.
    system_content = "You are a helpful assistant."
    if research_question:
        system_content = "".join([
            "You are a helpful assistant that extracts categories from survey responses. ",
            f"The specific task is to identify {specificity} categories of responses to a survey question. ",
            f"The research question is: {research_question}",
        ])

    # Leave temperature out entirely unless the caller chose one.
    sampling_kwargs = {}
    if creativity is not None:
        sampling_kwargs["temperature"] = creativity

    message = anthropic_client.messages.create(
        model=user_model,
        max_tokens=4096,
        system=system_content,
        messages=[{'role': 'user', 'content': prompt}],
        **sampling_kwargs
    )

    return message.content[0].text
|
506
|
+
|
507
|
+
|
catllm/text_functions.py
CHANGED
@@ -6,7 +6,9 @@ from .calls.all_calls import (
|
|
6
6
|
chain_of_verification_openai,
|
7
7
|
chain_of_verification_google,
|
8
8
|
chain_of_verification_anthropic,
|
9
|
-
chain_of_verification_mistral
|
9
|
+
chain_of_verification_mistral,
|
10
|
+
get_openai_top_n,
|
11
|
+
get_anthropic_top_n,
|
10
12
|
)
|
11
13
|
|
12
14
|
|
@@ -117,9 +119,9 @@ def explore_common_categories(
|
|
117
119
|
survey_question,
|
118
120
|
survey_input,
|
119
121
|
api_key,
|
120
|
-
top_n=
|
122
|
+
top_n=12,
|
121
123
|
cat_num=10,
|
122
|
-
divisions=
|
124
|
+
divisions=10,
|
123
125
|
user_model="gpt-5",
|
124
126
|
creativity=None,
|
125
127
|
specificity="broad",
|
@@ -164,20 +166,19 @@ Responses are contained within triple backticks here: ```{survey_participant_chu
|
|
164
166
|
Number your categories from 1 through {cat_num} and be concise with the category labels and provide no description of the categories."""
|
165
167
|
|
166
168
|
if model_source == "openai":
|
167
|
-
client = OpenAI(api_key=api_key)
|
168
169
|
try:
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
**({"temperature": creativity} if creativity is not None else {})
|
170
|
+
reply = get_openai_top_n(
|
171
|
+
prompt=prompt,
|
172
|
+
user_model=user_model,
|
173
|
+
specificity=specificity,
|
174
|
+
api_key=api_key,
|
175
|
+
model_source=model_source,
|
176
|
+
research_question=research_question,
|
177
|
+
creativity=creativity
|
178
178
|
)
|
179
|
-
|
179
|
+
|
180
180
|
responses.append(reply)
|
181
|
+
|
181
182
|
except BadRequestError as e:
|
182
183
|
if "context_length_exceeded" in str(e) or "maximum context length" in str(e):
|
183
184
|
error_msg = (f"Token limit exceeded for model {user_model}. "
|
@@ -187,6 +188,20 @@ Number your categories from 1 through {cat_num} and be concise with the category
|
|
187
188
|
print(f"OpenAI API error: {e}")
|
188
189
|
except Exception as e:
|
189
190
|
print(f"An error occurred: {e}")
|
191
|
+
|
192
|
+
elif model_source == "anthropic":
|
193
|
+
|
194
|
+
reply = get_anthropic_top_n(
|
195
|
+
prompt=prompt,
|
196
|
+
user_model=user_model,
|
197
|
+
specificity=specificity,
|
198
|
+
model_source=model_source,
|
199
|
+
api_key=api_key,
|
200
|
+
research_question=research_question,
|
201
|
+
creativity=creativity
|
202
|
+
)
|
203
|
+
|
204
|
+
responses.append(reply)
|
190
205
|
else:
|
191
206
|
raise ValueError(f"Unsupported model_source: {model_source}")
|
192
207
|
|
@@ -204,24 +219,87 @@ Number your categories from 1 through {cat_num} and be concise with the category
|
|
204
219
|
flat_list = [item.lower() for sublist in responses_list for item in sublist]
|
205
220
|
|
206
221
|
#convert flat_list to a df
|
222
|
+
def normalize_category(cat):
    """
    Build an order-insensitive key for a slash-separated category label.

    The label is split on '/', each term is stripped and lower-cased, empty
    terms are dropped, and the remaining terms are sorted and re-joined with
    '/'. Missing values (as judged by ``pd.isna``) are returned unchanged.

    Dropping empty terms means labels that differ only by a stray leading,
    trailing, or doubled slash (e.g. "commute/" vs "commute") map to the
    same key instead of being counted as separate categories.
    """
    if pd.isna(cat):
        return cat
    cleaned = (term.strip().lower() for term in str(cat).split('/'))
    # Skip empty pieces so "a//b" and "a/b" normalize identically.
    return '/'.join(sorted(term for term in cleaned if term))
|
227
|
+
|
228
|
+
# normalized column
|
207
229
|
df = pd.DataFrame(flat_list, columns=['Category'])
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
230
|
+
df['normalized'] = df['Category'].apply(normalize_category)
|
231
|
+
|
232
|
+
# group by normalized, count, and keep most frequent original
|
233
|
+
result = (df.groupby('normalized')
|
234
|
+
.agg(Category=('Category', lambda x: x.value_counts().index[0]),
|
235
|
+
counts=('Category', 'size'))
|
236
|
+
.sort_values('counts', ascending=False)
|
237
|
+
.reset_index(drop=True))
|
238
|
+
|
239
|
+
df = result
|
240
|
+
|
241
|
+
second_prompt = f"""You are a data analyst reviewing categorized survey data.
|
242
|
+
|
243
|
+
Task: From the provided categories, identify and return the top {top_n} CONCEPTUALLY UNIQUE categories.
|
244
|
+
|
245
|
+
Critical Instructions:
|
246
|
+
1. The categories have already been deduplicated for exact string matches
|
247
|
+
2. However, some categories may still be SEMANTICALLY DUPLICATES (same concept, different wording):
|
248
|
+
- "closer to work" and "commute/proximity to work" mean the same thing
|
249
|
+
- "breakup/household conflict" and "relationship problems" mean the same thing
|
250
|
+
3. When you identify semantic duplicates:
|
251
|
+
- Combine their frequencies mentally
|
252
|
+
- Keep the version that appears most frequently OR is most clearly worded
|
253
|
+
- Each concept should appear ONLY ONCE in your final list
|
254
|
+
4. Keep category names {specificity}
|
255
|
+
5. Return ONLY a numbered list of {top_n} conceptually unique categories
|
256
|
+
6. No additional text, explanations, or commentary
|
257
|
+
|
258
|
+
Pre-processed Categories (sorted by frequency):
|
259
|
+
{df['Category'].head(top_n * 3).tolist()}
|
260
|
+
|
261
|
+
Note: More categories than needed are provided so you can identify and merge semantic duplicates.
|
262
|
+
|
263
|
+
Output Format:
|
264
|
+
1. category name
|
265
|
+
2. category name
|
266
|
+
3. category name
|
267
|
+
|
268
|
+
Top {top_n} Conceptually Unique Categories:"""
|
212
269
|
|
213
|
-
second_prompt = f"""From this list of categories, extract the top {top_n} most common categories. \
|
214
|
-
The categories are contained within triple backticks here: ```{df['Category'].tolist()}``` \
|
215
|
-
Return the top {top_n} categories as a numbered list sorted from the most to least common and keep the categories {specificity}, with no additional text or explanation."""
|
216
270
|
|
217
271
|
if model_source == "openai":
|
218
|
-
|
272
|
+
|
273
|
+
base_url = (
|
274
|
+
"https://api.perplexity.ai" if model_source == "perplexity"
|
275
|
+
else "https://router.huggingface.co/v1" if model_source == "huggingface"
|
276
|
+
else None
|
277
|
+
)
|
278
|
+
|
279
|
+
client = OpenAI(api_key=api_key, base_url=base_url)
|
280
|
+
|
219
281
|
response_obj = client.chat.completions.create(
|
220
282
|
model=user_model,
|
221
283
|
messages=[{'role': 'user', 'content': second_prompt}],
|
222
|
-
temperature
|
284
|
+
**({"temperature": creativity} if creativity is not None else {})
|
223
285
|
)
|
224
|
-
|
286
|
+
|
287
|
+
top_categories = response_obj.choices[0].message.content
|
288
|
+
|
289
|
+
elif model_source == "anthropic":
|
290
|
+
import anthropic
|
291
|
+
|
292
|
+
client = anthropic.Anthropic(api_key=api_key)
|
293
|
+
|
294
|
+
response_obj = client.messages.create(
|
295
|
+
model=user_model,
|
296
|
+
max_tokens=4096,
|
297
|
+
messages=[{'role': 'user', 'content': second_prompt}],
|
298
|
+
**({"temperature": creativity} if creativity is not None else {})
|
299
|
+
)
|
300
|
+
|
301
|
+
top_categories = response_obj.content[0].text
|
302
|
+
|
225
303
|
print(top_categories)
|
226
304
|
|
227
305
|
top_categories_final = []
|
@@ -263,6 +341,10 @@ def multi_class(
|
|
263
341
|
chain_of_thought = True,
|
264
342
|
step_back_prompt = False,
|
265
343
|
context_prompt = False,
|
344
|
+
top_n = 12,
|
345
|
+
cat_num = 10,
|
346
|
+
divisions = 10,
|
347
|
+
research_question = None,
|
266
348
|
filename = "categorized_data.csv",
|
267
349
|
save_directory = None,
|
268
350
|
model_source = "auto"
|
@@ -273,6 +355,7 @@ def multi_class(
|
|
273
355
|
import regex
|
274
356
|
from tqdm import tqdm
|
275
357
|
|
358
|
+
#used in chain of verification
|
276
359
|
def remove_numbering(line):
|
277
360
|
line = line.strip()
|
278
361
|
|
@@ -321,16 +404,33 @@ def multi_class(
|
|
321
404
|
raise ValueError(f"❌ Could not auto-detect model source from '{user_model}'. Please specify model_source explicitly: OpenAI, Anthropic, Perplexity, Google, Huggingface, or Mistral")
|
322
405
|
else:
|
323
406
|
model_source = model_source.lower()
|
324
|
-
|
407
|
+
|
408
|
+
if categories == "auto":
|
409
|
+
if survey_question == "": # step back requires the survey question to function well
|
410
|
+
raise TypeError("survey_question is required when using step_back_prompt. Please provide the survey question you are analyzing.")
|
411
|
+
|
412
|
+
categories = explore_common_categories(
|
413
|
+
survey_question=survey_question,
|
414
|
+
survey_input=survey_input,
|
415
|
+
research_question=research_question,
|
416
|
+
api_key=api_key,
|
417
|
+
top_n=top_n,
|
418
|
+
cat_num=cat_num,
|
419
|
+
divisions=divisions
|
420
|
+
)
|
421
|
+
|
325
422
|
categories_str = "\n".join(f"{i + 1}. {cat}" for i, cat in enumerate(categories))
|
326
423
|
cat_num = len(categories)
|
327
424
|
category_dict = {str(i+1): "0" for i in range(cat_num)}
|
328
425
|
example_JSON = json.dumps(category_dict, indent=4)
|
329
426
|
|
330
|
-
# ensure number of categories is what user wants
|
331
427
|
print(f"\nThe categories you entered to be coded by {model_source} {user_model}:")
|
332
|
-
|
333
|
-
|
428
|
+
|
429
|
+
if categories != "auto":
|
430
|
+
# ensure number of categories is what user wants
|
431
|
+
|
432
|
+
for i, cat in enumerate(categories, 1):
|
433
|
+
print(f"{i}. {cat}")
|
334
434
|
|
335
435
|
link1 = []
|
336
436
|
extracted_jsons = []
|
File without changes
|
File without changes
|