cat-stack 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cat_stack/__about__.py +10 -0
- cat_stack/__init__.py +128 -0
- cat_stack/_batch.py +1388 -0
- cat_stack/_category_analysis.py +348 -0
- cat_stack/_chunked.py +424 -0
- cat_stack/_embeddings.py +189 -0
- cat_stack/_formatter.py +169 -0
- cat_stack/_providers.py +1048 -0
- cat_stack/_tiebreaker.py +277 -0
- cat_stack/_utils.py +512 -0
- cat_stack/_web_fetch.py +194 -0
- cat_stack/calls/CoVe.py +287 -0
- cat_stack/calls/__init__.py +25 -0
- cat_stack/calls/all_calls.py +622 -0
- cat_stack/calls/image_CoVe.py +386 -0
- cat_stack/calls/image_stepback.py +210 -0
- cat_stack/calls/pdf_CoVe.py +386 -0
- cat_stack/calls/pdf_stepback.py +210 -0
- cat_stack/calls/stepback.py +180 -0
- cat_stack/calls/top_n.py +217 -0
- cat_stack/classify.py +682 -0
- cat_stack/explore.py +111 -0
- cat_stack/extract.py +218 -0
- cat_stack/image_functions.py +2078 -0
- cat_stack/images/circle.png +0 -0
- cat_stack/images/cube.png +0 -0
- cat_stack/images/diamond.png +0 -0
- cat_stack/images/overlapping_pentagons.png +0 -0
- cat_stack/images/rectangles.png +0 -0
- cat_stack/model_reference_list.py +94 -0
- cat_stack/pdf_functions.py +2087 -0
- cat_stack/summarize.py +290 -0
- cat_stack/text_functions.py +1358 -0
- cat_stack/text_functions_ensemble.py +3644 -0
- cat_stack-0.1.0.dist-info/METADATA +150 -0
- cat_stack-0.1.0.dist-info/RECORD +38 -0
- cat_stack-0.1.0.dist-info/WHEEL +4 -0
- cat_stack-0.1.0.dist-info/licenses/LICENSE +672 -0
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cat-stack
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Domain-agnostic text, image, and PDF classification engine powered by LLMs
|
|
5
|
+
Project-URL: Documentation, https://github.com/chrissoria/cat-stack#readme
|
|
6
|
+
Project-URL: Issues, https://github.com/chrissoria/cat-stack/issues
|
|
7
|
+
Project-URL: Source, https://github.com/chrissoria/cat-stack
|
|
8
|
+
Author-email: Chris Soria <chrissoria@berkeley.edu>
|
|
9
|
+
License-Expression: GPL-3.0-or-later
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: ensemble,image classification,llm,pdf classification,structured output,text classification
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Programming Language :: Python
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
20
|
+
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
21
|
+
Requires-Python: >=3.8
|
|
22
|
+
Requires-Dist: anthropic
|
|
23
|
+
Requires-Dist: openai
|
|
24
|
+
Requires-Dist: pandas
|
|
25
|
+
Requires-Dist: perplexityai
|
|
26
|
+
Requires-Dist: requests
|
|
27
|
+
Requires-Dist: tqdm
|
|
28
|
+
Provides-Extra: embeddings
|
|
29
|
+
Requires-Dist: sentence-transformers>=2.2.0; extra == 'embeddings'
|
|
30
|
+
Provides-Extra: formatter
|
|
31
|
+
Requires-Dist: accelerate>=0.27.0; extra == 'formatter'
|
|
32
|
+
Requires-Dist: torch>=2.0.0; extra == 'formatter'
|
|
33
|
+
Requires-Dist: transformers>=4.40.0; extra == 'formatter'
|
|
34
|
+
Provides-Extra: pdf
|
|
35
|
+
Requires-Dist: pymupdf>=1.23.0; extra == 'pdf'
|
|
36
|
+
Description-Content-Type: text/markdown
|
|
37
|
+
|
|
38
|
+
# cat-stack
|
|
39
|
+
|
|
40
|
+
**Domain-agnostic text, image, and PDF classification engine powered by LLMs.**
|
|
41
|
+
|
|
42
|
+
`cat-stack` is the shared base package for the [CatLLM](https://github.com/chrissoria/cat-llm) ecosystem. It provides the core classification, extraction, exploration, and summarization engine that all domain-specific CatLLM packages build on.
|
|
43
|
+
|
|
44
|
+
## Installation
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
pip install cat-stack
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Optional extras:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pip install cat-stack[pdf] # PDF support (PyMuPDF)
|
|
54
|
+
pip install cat-stack[embeddings] # Embedding similarity scoring
|
|
55
|
+
pip install cat-stack[formatter] # JSON formatter fallback model
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Ecosystem
|
|
59
|
+
|
|
60
|
+
`cat-stack` is independently useful for classifying any text column. Domain-specific packages extend it with tuned prompts and workflows:
|
|
61
|
+
|
|
62
|
+
| Package | Domain |
|
|
63
|
+
|---------|--------|
|
|
64
|
+
| **cat-stack** | General-purpose text, image, PDF classification (this package) |
|
|
65
|
+
| **cat-survey** | Survey response classification |
|
|
66
|
+
| **cat-vader** | Social media text (Reddit, Twitter/X) |
|
|
67
|
+
| **cat-ademic** | Academic papers, PDFs, citations |
|
|
68
|
+
| **cat-cog** | Cognitive assessment & visual scoring (CERAD) |
|
|
69
|
+
| **cat-pol** | Political text (manifestos, speeches, legislation) |
|
|
70
|
+
|
|
71
|
+
Installing `cat-llm` pulls in all of the above.
|
|
72
|
+
|
|
73
|
+
## Quick Start
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
import cat_stack as cat
|
|
77
|
+
|
|
78
|
+
# Classify text into predefined categories
|
|
79
|
+
result = cat.classify(
|
|
80
|
+
input_data=df["text_column"],
|
|
81
|
+
categories=["Positive", "Negative", "Neutral"],
|
|
82
|
+
models=[("gpt-4o", "openai", OPENAI_KEY)],
|
|
83
|
+
filename="classified.csv"
|
|
84
|
+
)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Core API
|
|
88
|
+
|
|
89
|
+
### `classify()`
|
|
90
|
+
Assign predefined categories to text, images, or PDFs. Supports single-model and multi-model ensemble classification with consensus voting.
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
cat.classify(
|
|
94
|
+
input_data=df["text"],
|
|
95
|
+
categories=["Cat A", "Cat B", "Cat C"],
|
|
96
|
+
models=[("gpt-4o", "openai", key1), ("claude-sonnet-4-20250514", "anthropic", key2)],
|
|
97
|
+
filename="results.csv"
|
|
98
|
+
)
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### `extract()`
|
|
102
|
+
Discover categories from a corpus using LLM-driven exploration.
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
cat.extract(
|
|
106
|
+
input_data=df["text"],
|
|
107
|
+
survey_question="What is this text about?",
|
|
108
|
+
models=[("gpt-4o", "openai", key)],
|
|
109
|
+
)
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### `explore()`
|
|
113
|
+
Raw category extraction for saturation analysis.
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
cat.explore(
|
|
117
|
+
input_data=df["text"],
|
|
118
|
+
description="Describe the main themes",
|
|
119
|
+
models=[("gpt-4o", "openai", key)],
|
|
120
|
+
)
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### `summarize()`
|
|
124
|
+
Summarize text or PDF documents, with optional multi-model ensemble.
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
cat.summarize(
|
|
128
|
+
input_data=df["text"],
|
|
129
|
+
models=[("gpt-4o", "openai", key)],
|
|
130
|
+
filename="summaries.csv"
|
|
131
|
+
)
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## Supported Providers
|
|
135
|
+
|
|
136
|
+
OpenAI, Anthropic, Google (Gemini), Mistral, Perplexity, xAI (Grok), Hugging Face, and Ollama (local models).
|
|
137
|
+
|
|
138
|
+
All providers use the same `(model_name, provider, api_key)` tuple format. The provider is auto-detected from the model name if it is omitted.
|
|
139
|
+
|
|
140
|
+
## Features
|
|
141
|
+
|
|
142
|
+
- **Multi-model ensemble** with consensus voting and agreement scores
|
|
143
|
+
- **Batch API support** for OpenAI, Anthropic, Google, Mistral, and xAI
|
|
144
|
+
- **Prompt strategies**: Chain-of-Thought, Chain-of-Verification, step-back prompting, few-shot examples
|
|
145
|
+
- **Text, image, and PDF** input auto-detection
|
|
146
|
+
- **Embedding similarity** tiebreaker for ensemble consensus ties
|
|
147
|
+
|
|
148
|
+
## License
|
|
149
|
+
|
|
150
|
+
GPL-3.0-or-later
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
cat_stack/__about__.py,sha256=2yQWde3xin26jZeTfxWT9NpZPTNh75T7fpCjxPB-Jt8,411
|
|
2
|
+
cat_stack/__init__.py,sha256=WeM8X2AGCbhqumWgfliDVlUguB_8TCuOtEffgeTp7Yc,3542
|
|
3
|
+
cat_stack/_batch.py,sha256=2oJs1CcEWgVpP09TK-yJnEEwKq7lgHeXHI3HlBKJQUc,52703
|
|
4
|
+
cat_stack/_category_analysis.py,sha256=LYp40Pw1-0uT9cruhnn6kpjxr23xm4XItHpl4hZwyCw,12692
|
|
5
|
+
cat_stack/_chunked.py,sha256=1ZYb_G1anMnbtGV5zSfBTgIRlk78O1Viwj4bAmXDcOU,15356
|
|
6
|
+
cat_stack/_embeddings.py,sha256=FQYQYwG2cQ8kraLLwjysFBk-2S5f0dZylZ9HWGnGEwo,6659
|
|
7
|
+
cat_stack/_formatter.py,sha256=VgYn26zPVknPjEYE2iHEiygwol6iDrgS9kq4xgBMZ3A,5310
|
|
8
|
+
cat_stack/_providers.py,sha256=Ek9G7nW6pgAjgEy87jcLFIsc8tBlyfZ_gjiSdhVdI4Y,39047
|
|
9
|
+
cat_stack/_tiebreaker.py,sha256=gYP7hGbQcZ9uOPHIhzuCgFi46Kwu7ViAH0c3QlB-w1Y,9770
|
|
10
|
+
cat_stack/_utils.py,sha256=YX8UX9Z8VAAeN7BO1RClgg8XK_YelvO1RG-Yb5tND4c,15712
|
|
11
|
+
cat_stack/_web_fetch.py,sha256=zr86MXwq6jOuTmEJCLiKzbNtW36IfBaSp956lx3jkp0,5441
|
|
12
|
+
cat_stack/classify.py,sha256=5GkYX8R3yezBkKeSXwF9K1EoLRIRFCPkG1v_u-aRnSM,31951
|
|
13
|
+
cat_stack/explore.py,sha256=_LkdRkSIIXx0ixZbXGEfiA9irD96HtJC3dTZ7tQ_2MA,3943
|
|
14
|
+
cat_stack/extract.py,sha256=PmFlrGaXOPOfonXOkl4WHzFAdmTOLva-7ZLpEaBSC34,8047
|
|
15
|
+
cat_stack/image_functions.py,sha256=2O2vF7VEaeUesCKLXI-YHWqdCcBcgnX4b3LUegwS8bQ,89027
|
|
16
|
+
cat_stack/model_reference_list.py,sha256=37pWwMcgnf4biE3BVRluH5oz2P6ccdJJiCVNHodBH8k,2307
|
|
17
|
+
cat_stack/pdf_functions.py,sha256=KrEmpL_blY4J0VIr9W-4xFZTQjKCFsj1aWlC9Ckw5wM,88692
|
|
18
|
+
cat_stack/summarize.py,sha256=aRBXU95hnlhcx4I0sjvLUa9yV5n6K3tnYQQHh-0FAmU,11462
|
|
19
|
+
cat_stack/text_functions.py,sha256=lDv7dn8z3FoEoAzuZa5CiVWT2gP2yidXWHfHHyoRgKI,50932
|
|
20
|
+
cat_stack/text_functions_ensemble.py,sha256=GIVtfAgAefTTOTCSzhP0nAHcO6wFzRc9uNmpX_I318U,141882
|
|
21
|
+
cat_stack/calls/CoVe.py,sha256=92h-HATm_Sa9QT0nmuyURpkzm4wemSCwFhOZfBD_v_A,9819
|
|
22
|
+
cat_stack/calls/__init__.py,sha256=j5n25HE7_E4imYgM8VcjP5FSRGUvASq4Uq0AghL2-2E,738
|
|
23
|
+
cat_stack/calls/all_calls.py,sha256=SFqnbHZi8mjz2IUteSEsyz7z1DrTUTwNH2xlf64X4d8,20318
|
|
24
|
+
cat_stack/calls/image_CoVe.py,sha256=d6l0cvZFRWt2b669A9_voNu0ofxiF00l9CP9pA0Uzto,12827
|
|
25
|
+
cat_stack/calls/image_stepback.py,sha256=Wr_BIco0y0ReK93vJa4iiMZKtVT8NO1-B_irZn2Gyfc,5950
|
|
26
|
+
cat_stack/calls/pdf_CoVe.py,sha256=3hvq-W-We6OZeKBygF6n0VVqlRJxwF0coWZx40no0c4,12934
|
|
27
|
+
cat_stack/calls/pdf_stepback.py,sha256=UYCwVH074S6n6FqfpCCRVoHEJ-Y7DED9FN6gTWen8QI,5925
|
|
28
|
+
cat_stack/calls/stepback.py,sha256=0VIRreH3q1cQ2HPrgGm13A_n0gcCMfzBJculuvAOQuQ,4754
|
|
29
|
+
cat_stack/calls/top_n.py,sha256=mdPFLg3mJg9B3NjngR5aQjxM__B1br7uC-pSJtvWYXM,6283
|
|
30
|
+
cat_stack/images/circle.png,sha256=JWujAWAh08-TajAoEr_TAeFNLlfbryOLw6cgIBREBuQ,86202
|
|
31
|
+
cat_stack/images/cube.png,sha256=nFec3e5bmRe4zrBCJ8QK-HcJLrG7u7dYdKhmdMfacfE,77275
|
|
32
|
+
cat_stack/images/diamond.png,sha256=rJDZKtsnBGRO8FPA0iHuA8FvHFGi9PkI_DWSFdw6iv0,99568
|
|
33
|
+
cat_stack/images/overlapping_pentagons.png,sha256=VO5plI6eoVRnjfqinn1nNzsCP2WQhuQy71V0EASouW4,71208
|
|
34
|
+
cat_stack/images/rectangles.png,sha256=2XM16HO9EYWj2yHgN4bPXaCwPfl7iYQy0tQUGaJX9xg,40692
|
|
35
|
+
cat_stack-0.1.0.dist-info/METADATA,sha256=yYd_6IYCIj05YehBPugI208ENDTWnjqgyr3y3VWM-Jg,4818
|
|
36
|
+
cat_stack-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
37
|
+
cat_stack-0.1.0.dist-info/licenses/LICENSE,sha256=Vje2sS5WV4TnIwY5uQHrF4qnBAM3YOk1pGpdH0ot-2o,34969
|
|
38
|
+
cat_stack-0.1.0.dist-info/RECORD,,
|