cat-llm 0.0.52__py3-none-any.whl → 0.0.54__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cat_llm-0.0.52.dist-info → cat_llm-0.0.54.dist-info}/METADATA +59 -2
- {cat_llm-0.0.52.dist-info → cat_llm-0.0.54.dist-info}/RECORD +7 -7
- catllm/__about__.py +1 -1
- catllm/build_web_research.py +2 -2
- catllm/text_functions.py +31 -0
- {cat_llm-0.0.52.dist-info → cat_llm-0.0.54.dist-info}/WHEEL +0 -0
- {cat_llm-0.0.52.dist-info → cat_llm-0.0.54.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cat-llm
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.54
|
|
4
4
|
Summary: A tool for categorizing text data and images using LLMs and vision models
|
|
5
5
|
Project-URL: Documentation, https://github.com/chrissoria/cat-llm#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/chrissoria/cat-llm/issues
|
|
7
7
|
Project-URL: Source, https://github.com/chrissoria/cat-llm
|
|
8
|
-
Author-email:
|
|
8
|
+
Author-email: Chris Soria <chrissoria@berkeley.edu>
|
|
9
9
|
License-Expression: MIT
|
|
10
10
|
License-File: LICENSE
|
|
11
11
|
Keywords: categorizer,image classification,llm,structured output,survey data,text classification
|
|
@@ -19,7 +19,9 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
19
19
|
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
20
20
|
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
21
21
|
Requires-Python: >=3.8
|
|
22
|
+
Requires-Dist: openai
|
|
22
23
|
Requires-Dist: pandas
|
|
24
|
+
Requires-Dist: requests
|
|
23
25
|
Requires-Dist: tqdm
|
|
24
26
|
Description-Content-Type: text/markdown
|
|
25
27
|
|
|
@@ -44,6 +46,7 @@ Description-Content-Type: text/markdown
|
|
|
44
46
|
- [multi_class()](#multi_class)
|
|
45
47
|
- [image_score()](#image_score)
|
|
46
48
|
- [image_features()](#image_features)
|
|
49
|
+
- [build_web_research_dataset()](#build_web_research_dataset)
|
|
47
50
|
- [cerad_drawn_score()](#cerad_drawn_score)
|
|
48
51
|
- [Academic Research](#academic-research)
|
|
49
52
|
- [License](#license)
|
|
@@ -344,6 +347,60 @@ image_scores = cat.image_features(
|
|
|
344
347
|
api_key="OPENAI_API_KEY")
|
|
345
348
|
```
|
|
346
349
|
|
|
350
|
+
### `build_web_research_dataset()`
|
|
351
|
+
|
|
352
|
+
Conducts automated web research on specified topics and compiles the findings into a structured dataset, extracting answers and source URLs for comprehensive research workflows.
|
|
353
|
+
|
|
354
|
+
NOTE: This function currently only works with Anthropic models and requires an Anthropic API key. It is strongly recommended to increase your API rate limits before using this function to avoid interruptions during web research tasks.
|
|
355
|
+
|
|
356
|
+
SECOND NOTE: This function works best if you are specific with your search question. For example, instead of search_question="Hottest temperature in 2024?" you should use "Hottest temperature in 2024 from extremeweatherwatch.com?" or "Hottest temperature in 2024 from weatherunderground.com?". Another example is to use "Where these UC Berkeley professors got their PhD according to Linkedin?" instead of "Where they got their PhD according to Linkedin?" to avoid matching people with the same name.
|
|
357
|
+
|
|
358
|
+
THIRD NOTE: This function works by scraping data from the web. Be aware that not all websites allow webscraping from Anthropic and therefore the function won't be able to retrieve information from these sites.
|
|
359
|
+
|
|
360
|
+
**Methodology:**
|
|
361
|
+
Performs systematic web searches using the specified search questions and processes the results through Anthropic's language models to extract relevant information. The function handles multiple search queries sequentially, applying time delays between requests to respect rate limits. Results are categorized according to user-defined criteria and can be exported to CSV format for further analysis and research documentation.
|
|
362
|
+
|
|
363
|
+
**Rate Limits:**
|
|
364
|
+
Before using this function, review and increase your Anthropic API rate limits at: https://console.anthropic.com/settings/limits. For general information about API rate limits, consult the Anthropic documentation at: https://docs.anthropic.com/claude/reference/rate-limits
|
|
365
|
+
|
|
366
|
+
**Parameters:**
|
|
367
|
+
- `search_question` (str): Primary research question or topic to guide the search strategy
|
|
368
|
+
- `search_input` (list): List of specific search queries or questions to investigate
|
|
369
|
+
- `categories` (list, default=['Answer', 'URL']): Column names for the values to extract for each search query (e.g., ['Answer', 'URL', 'Date', 'Key_Technology'])
|
|
370
|
+
- `api_key` (str): API key for the LLM service
|
|
371
|
+
- `answer_format` (str, default="concise"): Response detail level ("concise", "detailed", "comprehensive")
|
|
372
|
+
- `additional_instructions` (str, default=""): Extra guidance appended to the prompt when processing search results (e.g., scope or focus instructions)
|
|
373
|
+
- `user_model` (str, default="claude-sonnet-4-20250514"): Specific Anthropic model to use
|
|
374
|
+
- `creativity` (float, default=0): Temperature/randomness setting (0.0-1.0)
|
|
375
|
+
- `safety` (bool, default=False): Enable safety checks and save results at each API call step
|
|
376
|
+
- `filename` (str, default="categorized_data.csv"): Filename for CSV output
|
|
377
|
+
- `save_directory` (str, optional): Directory path to save the CSV file
|
|
378
|
+
- `model_source` (str, default="Anthropic"): Model provider (currently only "Anthropic" is supported for web research)
|
|
379
|
+
- `time_delay` (int, default=15): Delay in seconds between search requests to manage API rate limits
|
|
380
|
+
|
|
381
|
+
**Returns:**
|
|
382
|
+
- `pandas.DataFrame`: DataFrame with one row per search query, containing the extracted answers and source URLs for each requested category
|
|
383
|
+
|
|
384
|
+
**Example:**
|
|
385
|
+
|
|
386
|
+
```
|
|
387
|
+
import catllm as cat
|
|
388
|
+
|
|
389
|
+
research_data = cat.build_web_research_dataset(
|
|
390
|
+
search_question="What are the latest developments in renewable energy technology?",
|
|
391
|
+
search_input=["solar panel efficiency 2025", "wind turbine innovations", "battery storage breakthroughs"],
|
|
392
|
+
api_key="ANTHROPIC_API_KEY",
|
|
393
|
+
answer_format="detailed",
|
|
394
|
+
additional_instructions="Focus on recent technological advances and commercial applications",
|
|
395
|
+
categories=['Answer', 'URL', 'Date', 'Key_Technology'],
|
|
396
|
+
model_source="Anthropic",
|
|
397
|
+
user_model="claude-3-7-sonnet-20250219",
|
|
398
|
+
creativity=0.1,
|
|
399
|
+
safety=True,
|
|
400
|
+
time_delay=3
|
|
401
|
+
)
|
|
402
|
+
```
|
|
403
|
+
|
|
347
404
|
### `cerad_drawn_score()`
|
|
348
405
|
|
|
349
406
|
Automatically scores drawings of circles, diamonds, overlapping rectangles, and cubes according to the official Consortium to Establish a Registry for Alzheimer's Disease (CERAD) scoring system, returning structured results with optional CSV export. Works even with images that contain other drawings or writing.
|
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
catllm/CERAD_functions.py,sha256=NNEu_Q10tClV7vRIVEgSQY8ujlXDbpWDzo1AbqlN7nQ,22462
|
|
2
|
-
catllm/__about__.py,sha256=
|
|
2
|
+
catllm/__about__.py,sha256=aaUp77jCD5GfUcnWYE3cYhgAzuymMUxVR6eleHYJIYA,404
|
|
3
3
|
catllm/__init__.py,sha256=sf02zp7N0NW0mAQi7eQ4gliWR1EwoqvXkHN2HwwjcTE,372
|
|
4
|
-
catllm/build_web_research.py,sha256=
|
|
4
|
+
catllm/build_web_research.py,sha256=CYGhxnonJLBw80ATEBkpRjOKJgCYntHTgx4s4Pb8g88,6833
|
|
5
5
|
catllm/image_functions.py,sha256=Gz-djnXVaLT8GOR0sc8aPjjuC9L_gIT2AjUMjsjjmi0,35492
|
|
6
|
-
catllm/text_functions.py,sha256=
|
|
6
|
+
catllm/text_functions.py,sha256=bRkzBAg5G86466pF09gsR_zi0gJy81iKdvmjL5CvDOw,17911
|
|
7
7
|
catllm/images/circle.png,sha256=JWujAWAh08-TajAoEr_TAeFNLlfbryOLw6cgIBREBuQ,86202
|
|
8
8
|
catllm/images/cube.png,sha256=nFec3e5bmRe4zrBCJ8QK-HcJLrG7u7dYdKhmdMfacfE,77275
|
|
9
9
|
catllm/images/diamond.png,sha256=rJDZKtsnBGRO8FPA0iHuA8FvHFGi9PkI_DWSFdw6iv0,99568
|
|
10
10
|
catllm/images/overlapping_pentagons.png,sha256=VO5plI6eoVRnjfqinn1nNzsCP2WQhuQy71V0EASouW4,71208
|
|
11
11
|
catllm/images/rectangles.png,sha256=2XM16HO9EYWj2yHgN4bPXaCwPfl7iYQy0tQUGaJX9xg,40692
|
|
12
|
-
cat_llm-0.0.
|
|
13
|
-
cat_llm-0.0.
|
|
14
|
-
cat_llm-0.0.
|
|
15
|
-
cat_llm-0.0.
|
|
12
|
+
cat_llm-0.0.54.dist-info/METADATA,sha256=XoNc8RZEAq1xnKxsbKpXiDnBaKf73C5JZIjlB8ldVtA,21499
|
|
13
|
+
cat_llm-0.0.54.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
14
|
+
cat_llm-0.0.54.dist-info/licenses/LICENSE,sha256=Vje2sS5WV4TnIwY5uQHrF4qnBAM3YOk1pGpdH0ot-2o,34969
|
|
15
|
+
cat_llm-0.0.54.dist-info/RECORD,,
|
catllm/__about__.py
CHANGED
catllm/build_web_research.py
CHANGED
|
@@ -6,13 +6,13 @@ def build_web_research_dataset(
|
|
|
6
6
|
answer_format = "concise",
|
|
7
7
|
additional_instructions = "",
|
|
8
8
|
categories = ['Answer','URL'],
|
|
9
|
-
user_model="claude-
|
|
9
|
+
user_model="claude-sonnet-4-20250514",
|
|
10
10
|
creativity=0,
|
|
11
11
|
safety=False,
|
|
12
12
|
filename="categorized_data.csv",
|
|
13
13
|
save_directory=None,
|
|
14
14
|
model_source="Anthropic",
|
|
15
|
-
time_delay=
|
|
15
|
+
time_delay=15
|
|
16
16
|
):
|
|
17
17
|
import os
|
|
18
18
|
import json
|
catllm/text_functions.py
CHANGED
|
@@ -307,6 +307,37 @@ Provide your work in JSON format where the number belonging to each category is
|
|
|
307
307
|
except Exception as e:
|
|
308
308
|
print(f"An error occurred: {e}")
|
|
309
309
|
link1.append(f"Error processing input: {e}")
|
|
310
|
+
|
|
311
|
+
elif model_source == "Google":
|
|
312
|
+
import requests
|
|
313
|
+
url = f"https://generativelanguage.googleapis.com/v1beta/models/{user_model}:generateContent"
|
|
314
|
+
try:
|
|
315
|
+
headers = {
|
|
316
|
+
"x-goog-api-key": api_key,
|
|
317
|
+
"Content-Type": "application/json"
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
payload = {
|
|
321
|
+
"contents": [{
|
|
322
|
+
"parts": [{"text": prompt}]
|
|
323
|
+
}]
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
response = requests.post(url, headers=headers, json=payload)
|
|
327
|
+
response.raise_for_status() # Raise exception for HTTP errors
|
|
328
|
+
result = response.json()
|
|
329
|
+
|
|
330
|
+
if "candidates" in result and result["candidates"]:
|
|
331
|
+
reply = result["candidates"][0]["content"]["parts"][0]["text"]
|
|
332
|
+
else:
|
|
333
|
+
reply = "No response generated"
|
|
334
|
+
|
|
335
|
+
link1.append(reply)
|
|
336
|
+
print(reply)
|
|
337
|
+
except Exception as e:
|
|
338
|
+
print(f"An error occurred: {e}")
|
|
339
|
+
link1.append(f"Error processing input: {e}")
|
|
340
|
+
|
|
310
341
|
elif model_source == "Mistral":
|
|
311
342
|
from mistralai import Mistral
|
|
312
343
|
client = Mistral(api_key=api_key)
|
|
File without changes
|
|
File without changes
|