cat-stack 1.0.18__tar.gz → 1.0.22__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cat_stack-1.0.18 → cat_stack-1.0.22}/PKG-INFO +2 -2
- {cat_stack-1.0.18 → cat_stack-1.0.22}/README.md +1 -1
- {cat_stack-1.0.18 → cat_stack-1.0.22}/pyproject.toml +12 -10
- cat_stack-1.0.22/src/cat_stack/__init__.py +18 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/__about__.py +1 -1
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/_formatter.py +62 -14
- cat_stack-1.0.22/src/catstack/_prompts.py +205 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/_utils.py +3 -3
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/classify.py +39 -7
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/explore.py +2 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/extract.py +2 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/text_functions.py +23 -25
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/text_functions_ensemble.py +17 -0
- {cat_stack-1.0.18 → cat_stack-1.0.22}/.gitignore +0 -0
- {cat_stack-1.0.18 → cat_stack-1.0.22}/LICENSE +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/__init__.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/_batch.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/_category_analysis.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/_chunked.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/_embeddings.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/_pilot_test.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/_providers.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/_review_ui.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/_tiebreaker.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/_web_fetch.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/calls/CoVe.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/calls/__init__.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/calls/all_calls.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/calls/image_CoVe.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/calls/image_stepback.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/calls/pdf_CoVe.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/calls/pdf_stepback.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/calls/stepback.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/calls/top_n.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/image_functions.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/images/circle.png +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/images/cube.png +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/images/diamond.png +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/images/overlapping_pentagons.png +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/images/rectangles.png +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/model_reference_list.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/pdf_functions.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/prompt_tune.py +0 -0
- {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/summarize.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cat-stack
|
|
3
|
-
Version: 1.0.18
|
|
3
|
+
Version: 1.0.22
|
|
4
4
|
Summary: Domain-agnostic text, image, PDF, and DOCX classification engine powered by LLMs
|
|
5
5
|
Project-URL: Documentation, https://github.com/chrissoria/cat-stack#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/chrissoria/cat-stack/issues
|
|
@@ -73,7 +73,7 @@ Installing `cat-llm` pulls in all of the above.
|
|
|
73
73
|
## Quick Start
|
|
74
74
|
|
|
75
75
|
```python
|
|
76
|
-
import cat_stack as cat
|
|
76
|
+
import catstack as cat
|
|
77
77
|
|
|
78
78
|
# Classify text into predefined categories
|
|
79
79
|
result = cat.classify(
|
|
@@ -43,38 +43,40 @@ Issues = "https://github.com/chrissoria/cat-stack/issues"
|
|
|
43
43
|
Source = "https://github.com/chrissoria/cat-stack"
|
|
44
44
|
|
|
45
45
|
[tool.hatch.version]
|
|
46
|
-
path = "src/cat_stack/__about__.py"
|
|
46
|
+
path = "src/catstack/__about__.py"
|
|
47
47
|
|
|
48
48
|
[tool.hatch.envs.types]
|
|
49
49
|
extra-dependencies = [
|
|
50
50
|
"mypy>=1.0.0",
|
|
51
51
|
]
|
|
52
52
|
[tool.hatch.envs.types.scripts]
|
|
53
|
-
check = "mypy --install-types --non-interactive {args:src/cat_stack tests}"
|
|
53
|
+
check = "mypy --install-types --non-interactive {args:src/catstack tests}"
|
|
54
54
|
|
|
55
55
|
[tool.hatch.build.targets.wheel]
|
|
56
|
-
packages = ["src/cat_stack"]
|
|
56
|
+
packages = ["src/catstack", "src/cat_stack"]
|
|
57
57
|
include = [
|
|
58
|
-
"src/cat_stack/**/*.py",
|
|
59
|
-
"src/cat_stack/images/*",
|
|
58
|
+
"src/catstack/**/*.py",
|
|
59
|
+
"src/catstack/images/*",
|
|
60
|
+
"src/cat_stack/__init__.py",
|
|
60
61
|
]
|
|
61
62
|
|
|
62
63
|
[tool.hatch.build.targets.sdist]
|
|
63
64
|
include = [
|
|
64
|
-
"src/cat_stack/**/*.py",
|
|
65
|
-
"src/cat_stack/images/*",
|
|
65
|
+
"src/catstack/**/*.py",
|
|
66
|
+
"src/catstack/images/*",
|
|
67
|
+
"src/cat_stack/__init__.py",
|
|
66
68
|
]
|
|
67
69
|
|
|
68
70
|
[tool.coverage.run]
|
|
69
|
-
source_pkgs = ["cat_stack", "tests"]
|
|
71
|
+
source_pkgs = ["catstack", "tests"]
|
|
70
72
|
branch = true
|
|
71
73
|
parallel = true
|
|
72
74
|
omit = [
|
|
73
|
-
"src/cat_stack/__about__.py",
|
|
75
|
+
"src/catstack/__about__.py",
|
|
74
76
|
]
|
|
75
77
|
|
|
76
78
|
[tool.coverage.paths]
|
|
77
|
-
|
|
79
|
+
catstack = ["src/catstack", "*/cat-stack/src/catstack"]
|
|
78
80
|
tests = ["tests", "*/cat-stack/tests"]
|
|
79
81
|
|
|
80
82
|
[tool.coverage.report]
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Back-compat alias for `catstack`.
|
|
2
|
+
|
|
3
|
+
The canonical import name is `catstack`. `cat_stack` is retained so existing
|
|
4
|
+
code continues to work; prefer `catstack` in new code.
|
|
5
|
+
"""
|
|
6
|
+
import importlib
|
|
7
|
+
import sys
|
|
8
|
+
|
|
9
|
+
_canonical = "catstack"
|
|
10
|
+
_real = importlib.import_module(_canonical)
|
|
11
|
+
|
|
12
|
+
sys.modules[__name__] = _real
|
|
13
|
+
|
|
14
|
+
_src_prefix = _canonical + "."
|
|
15
|
+
_dst_prefix = __name__ + "."
|
|
16
|
+
for _name in list(sys.modules):
|
|
17
|
+
if _name.startswith(_src_prefix):
|
|
18
|
+
sys.modules[_dst_prefix + _name[len(_src_prefix):]] = sys.modules[_name]
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
4
|
-
__version__ = "1.0.18"
|
|
4
|
+
__version__ = "1.0.22"
|
|
5
5
|
__author__ = "Chris Soria"
|
|
6
6
|
__email__ = "chrissoria@berkeley.edu"
|
|
7
7
|
__title__ = "cat-stack"
|
|
@@ -42,6 +42,56 @@ def _check_dependencies():
|
|
|
42
42
|
)
|
|
43
43
|
|
|
44
44
|
|
|
45
|
+
def _ensure_dependencies(verbose: bool = True) -> bool:
|
|
46
|
+
"""Ensure formatter Python dependencies are installed.
|
|
47
|
+
|
|
48
|
+
Tries to import torch/transformers/accelerate. If any are missing,
|
|
49
|
+
auto-installs them via pip after printing a clear warning about the
|
|
50
|
+
download size (~1.5 GB total). Returns True on success, False on
|
|
51
|
+
install failure.
|
|
52
|
+
"""
|
|
53
|
+
try:
|
|
54
|
+
import torch # noqa: F401
|
|
55
|
+
import transformers # noqa: F401
|
|
56
|
+
import accelerate # noqa: F401
|
|
57
|
+
return True
|
|
58
|
+
except ImportError:
|
|
59
|
+
pass
|
|
60
|
+
|
|
61
|
+
if verbose:
|
|
62
|
+
print(
|
|
63
|
+
"\n[CatLLM] JSON formatter dependencies (transformers, torch, "
|
|
64
|
+
"accelerate)\n"
|
|
65
|
+
" are not installed in this Python environment. Installing now\n"
|
|
66
|
+
" (~1.5 GB download; one-time). To skip this and disable the\n"
|
|
67
|
+
" formatter, pass json_formatter=False."
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
import subprocess
|
|
71
|
+
try:
|
|
72
|
+
subprocess.check_call(
|
|
73
|
+
[sys.executable, "-m", "pip", "install", "--quiet",
|
|
74
|
+
"transformers", "torch", "accelerate", "sentencepiece"]
|
|
75
|
+
)
|
|
76
|
+
except subprocess.CalledProcessError as e:
|
|
77
|
+
if verbose:
|
|
78
|
+
print(
|
|
79
|
+
f"[CatLLM] Failed to install formatter dependencies ({e}).\n"
|
|
80
|
+
" Install manually: pip install 'cat-llm[formatter]'"
|
|
81
|
+
)
|
|
82
|
+
return False
|
|
83
|
+
|
|
84
|
+
# Verify import works now
|
|
85
|
+
try:
|
|
86
|
+
import torch # noqa: F401
|
|
87
|
+
import transformers # noqa: F401
|
|
88
|
+
return True
|
|
89
|
+
except ImportError as e:
|
|
90
|
+
if verbose:
|
|
91
|
+
print(f"[CatLLM] Formatter deps installed but import failed: {e}")
|
|
92
|
+
return False
|
|
93
|
+
|
|
94
|
+
|
|
45
95
|
def _is_model_cached() -> bool:
|
|
46
96
|
"""Check if the merged model is already in the HuggingFace cache."""
|
|
47
97
|
try:
|
|
@@ -54,31 +104,29 @@ def _is_model_cached() -> bool:
|
|
|
54
104
|
|
|
55
105
|
def ensure_formatter_available() -> bool:
|
|
56
106
|
"""
|
|
57
|
-
Ensure the formatter model
|
|
107
|
+
Ensure the formatter model and its Python dependencies are available.
|
|
108
|
+
|
|
109
|
+
Auto-installs deps (transformers/torch/accelerate, ~1.5 GB) on first use
|
|
110
|
+
and auto-downloads the formatter model (~1 GB) from HuggingFace on first
|
|
111
|
+
use. Both events print a clear warning to the console; neither prompts
|
|
112
|
+
interactively, so this function is safe to call from Rscript / non-TTY
|
|
113
|
+
sessions.
|
|
58
114
|
|
|
59
115
|
Returns:
|
|
60
|
-
True if the formatter is ready to use, False
|
|
116
|
+
True if the formatter is ready to use, False on install failure.
|
|
61
117
|
"""
|
|
62
|
-
|
|
118
|
+
if not _ensure_dependencies():
|
|
119
|
+
return False
|
|
63
120
|
|
|
64
121
|
if _is_model_cached():
|
|
65
122
|
return True
|
|
66
123
|
|
|
67
124
|
print(
|
|
68
|
-
"\n[CatLLM]
|
|
125
|
+
"\n[CatLLM] Downloading JSON formatter model (~1 GB) from\n"
|
|
69
126
|
f" HuggingFace Hub ({_MERGED_MODEL_REPO}).\n"
|
|
70
127
|
" This is a one-time download — the model is cached locally after."
|
|
71
128
|
)
|
|
72
|
-
|
|
73
|
-
answer = input(" Continue? (Y/n): ").strip().lower()
|
|
74
|
-
except (EOFError, KeyboardInterrupt):
|
|
75
|
-
answer = "n"
|
|
76
|
-
|
|
77
|
-
if answer in ("", "y", "yes"):
|
|
78
|
-
return True
|
|
79
|
-
else:
|
|
80
|
-
print(" -> JSON formatter disabled for this run.\n")
|
|
81
|
-
return False
|
|
129
|
+
return True # actual download happens in load_formatter()
|
|
82
130
|
|
|
83
131
|
|
|
84
132
|
def load_formatter(device=None):
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Domain-keyed prompt registry.
|
|
3
|
+
|
|
4
|
+
cat-stack's extract() and explore() pipelines use two LLM prompts: a
|
|
5
|
+
*first-pass* per-chunk extraction prompt and a *second-pass* semantic
|
|
6
|
+
*merge* prompt. The wording of each is domain-shaped — survey responses
|
|
7
|
+
read differently than social-media posts or academic papers.
|
|
8
|
+
|
|
9
|
+
This module centralises every variant in one place. Domain-specific
|
|
10
|
+
sub-packages (cat-survey, cat-vader, cat-ademic, cat-pol, cat-web) call
|
|
11
|
+
catstack.extract/explore with `domain="<key>"` to select the appropriate
|
|
12
|
+
variant. The default is `"neutral"`, which contains no domain-shaped
|
|
13
|
+
language so direct catstack callers get generic prompts.
|
|
14
|
+
|
|
15
|
+
A domain only needs to override the slots that genuinely differ from
|
|
16
|
+
neutral; unspecified slots fall back to neutral via `get_prompt`.
|
|
17
|
+
|
|
18
|
+
Template placeholders:
|
|
19
|
+
|
|
20
|
+
first_pass — {categories_per_chunk} {specificity} {context}
|
|
21
|
+
{focus_text} {items_blob}
|
|
22
|
+
merge — {context} {max_categories} {name_instruction}
|
|
23
|
+
{seed_with_counts}
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
# Generic, domain-neutral templates. Used directly when the caller does
|
|
27
|
+
# not pass a domain, and used as the fallback for any slot a domain does
|
|
28
|
+
# not override.
|
|
29
|
+
_NEUTRAL_FIRST_PASS = (
|
|
30
|
+
'Identify {categories_per_chunk} {specificity} categories present in '
|
|
31
|
+
'the following texts about: "{context}".{focus_text} '
|
|
32
|
+
"Items are separated by semicolons. "
|
|
33
|
+
"Items are within triple backticks: ```{items_blob}``` "
|
|
34
|
+
"Number your categories from 1 through {categories_per_chunk} and "
|
|
35
|
+
"provide concise labels only (no descriptions)."
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
_NEUTRAL_MERGE = """
|
|
39
|
+
You are consolidating categories extracted from a collection of texts about: "{context}"
|
|
40
|
+
|
|
41
|
+
Task: Reduce to {max_categories} categories.
|
|
42
|
+
|
|
43
|
+
Step 1 — Cluster: Group the categories below into clusters where each cluster represents ONE distinct concept or theme. Categories that describe the same concept using different words or from different angles belong in the same cluster. For example, a category about "battery life" and a category about "charge duration" likely belong together if they reflect the same underlying concept.
|
|
44
|
+
|
|
45
|
+
Step 2 — Label: For each cluster, choose the single label that best captures the shared meaning. {name_instruction}
|
|
46
|
+
|
|
47
|
+
Step 3 — Rank: Sum the frequency counts within each cluster. Output the top {max_categories} clusters by total count.
|
|
48
|
+
|
|
49
|
+
Categories (sorted by extraction frequency):
|
|
50
|
+
{seed_with_counts}
|
|
51
|
+
|
|
52
|
+
Return ONLY a numbered list of {max_categories} categories. Each line must follow this exact format:
|
|
53
|
+
N. Category Label (such as example 1, example 2, example 3)
|
|
54
|
+
|
|
55
|
+
Example:
|
|
56
|
+
1. Financial Pressures (such as rising costs, budget constraints, or loss of income)
|
|
57
|
+
2. Location or Environment (such as moving to a new city, neighborhood quality, or proximity to amenities)
|
|
58
|
+
""".strip()
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# Survey: the historical cat-stack prompt, verbatim. "respondent" /
|
|
62
|
+
# "reason" language preserved.
|
|
63
|
+
_SURVEY_FIRST_PASS = (
|
|
64
|
+
'Identify {categories_per_chunk} {specificity} categories of responses '
|
|
65
|
+
'to the question "{context}" in the following list of responses.{focus_text} '
|
|
66
|
+
"Responses are separated by semicolons. "
|
|
67
|
+
"Responses are within triple backticks: ```{items_blob}``` "
|
|
68
|
+
"Number your categories from 1 through {categories_per_chunk} and "
|
|
69
|
+
"provide concise labels only (no descriptions)."
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
_SURVEY_MERGE = """
|
|
73
|
+
You are consolidating categories extracted from survey responses to: "{context}"
|
|
74
|
+
|
|
75
|
+
Task: Reduce to {max_categories} categories.
|
|
76
|
+
|
|
77
|
+
Step 1 — Cluster: Group the categories below into clusters where each cluster represents ONE distinct reason a respondent might give. Categories that describe the same reason using different words or from different angles belong in the same cluster. For example, a category about relationship quality and a category about emotional closeness likely belong together if they reflect the same underlying reason.
|
|
78
|
+
|
|
79
|
+
Step 2 — Label: For each cluster, choose the single label that best captures the shared meaning. {name_instruction}
|
|
80
|
+
|
|
81
|
+
Step 3 — Rank: Sum the frequency counts within each cluster. Output the top {max_categories} clusters by total count.
|
|
82
|
+
|
|
83
|
+
Categories (sorted by extraction frequency):
|
|
84
|
+
{seed_with_counts}
|
|
85
|
+
|
|
86
|
+
Return ONLY a numbered list of {max_categories} categories. Each line must follow this exact format:
|
|
87
|
+
N. Category Label (such as example 1, example 2, example 3)
|
|
88
|
+
|
|
89
|
+
Example:
|
|
90
|
+
1. Financial Pressures (such as rising rent, job loss, or inability to afford housing)
|
|
91
|
+
2. Proximity to Family (such as moving closer to parents, children, or extended relatives)
|
|
92
|
+
""".strip()
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
_SOCIAL_MERGE = """
|
|
96
|
+
You are consolidating categories extracted from social-media posts about: "{context}"
|
|
97
|
+
|
|
98
|
+
Task: Reduce to {max_categories} categories.
|
|
99
|
+
|
|
100
|
+
Step 1 — Cluster: Group the categories below into clusters where each cluster represents ONE distinct topic, sentiment, or behaviour expressed in the posts. Categories that describe the same underlying message using different wording, slang, or hashtags belong in the same cluster. For example, a category about "product praise" and a category about "positive recommendation" likely belong together if they reflect the same underlying sentiment.
|
|
101
|
+
|
|
102
|
+
Step 2 — Label: For each cluster, choose the single label that best captures the shared meaning. {name_instruction}
|
|
103
|
+
|
|
104
|
+
Step 3 — Rank: Sum the frequency counts within each cluster. Output the top {max_categories} clusters by total count.
|
|
105
|
+
|
|
106
|
+
Categories (sorted by extraction frequency):
|
|
107
|
+
{seed_with_counts}
|
|
108
|
+
|
|
109
|
+
Return ONLY a numbered list of {max_categories} categories.
|
|
110
|
+
""".strip()
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
_ACADEMIC_MERGE = """
|
|
114
|
+
You are consolidating categories extracted from academic texts about: "{context}"
|
|
115
|
+
|
|
116
|
+
Task: Reduce to {max_categories} categories.
|
|
117
|
+
|
|
118
|
+
Step 1 — Cluster: Group the categories below into clusters where each cluster represents ONE distinct research theme, method, or finding. Categories that describe the same scholarly concept using different terminology or framings belong in the same cluster. For example, a category about "longitudinal cohort analysis" and a category about "panel data study design" likely belong together if they reflect the same underlying research approach.
|
|
119
|
+
|
|
120
|
+
Step 2 — Label: For each cluster, choose the single label that best captures the shared meaning. {name_instruction}
|
|
121
|
+
|
|
122
|
+
Step 3 — Rank: Sum the frequency counts within each cluster. Output the top {max_categories} clusters by total count.
|
|
123
|
+
|
|
124
|
+
Categories (sorted by extraction frequency):
|
|
125
|
+
{seed_with_counts}
|
|
126
|
+
|
|
127
|
+
Return ONLY a numbered list of {max_categories} categories.
|
|
128
|
+
""".strip()
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
_POLICY_MERGE = """
|
|
132
|
+
You are consolidating categories extracted from policy documents about: "{context}"
|
|
133
|
+
|
|
134
|
+
Task: Reduce to {max_categories} categories.
|
|
135
|
+
|
|
136
|
+
Step 1 — Cluster: Group the categories below into clusters where each cluster represents ONE distinct policy area, provision, or government action. Categories that describe the same provision using different statutory language or framings belong in the same cluster. For example, a category about "Medicaid eligibility expansion" and a category about "low-income healthcare coverage extension" likely belong together if they reflect the same underlying policy mechanism.
|
|
137
|
+
|
|
138
|
+
Step 2 — Label: For each cluster, choose the single label that best captures the policy area or provision. {name_instruction}
|
|
139
|
+
|
|
140
|
+
Step 3 — Rank: Sum the frequency counts within each cluster. Output the top {max_categories} clusters by total count.
|
|
141
|
+
|
|
142
|
+
Categories (sorted by extraction frequency):
|
|
143
|
+
{seed_with_counts}
|
|
144
|
+
|
|
145
|
+
Return ONLY a numbered list of {max_categories} categories.
|
|
146
|
+
""".strip()
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
_WEB_MERGE = """
|
|
150
|
+
You are consolidating categories extracted from web content about: "{context}"
|
|
151
|
+
|
|
152
|
+
Task: Reduce to {max_categories} categories.
|
|
153
|
+
|
|
154
|
+
Step 1 — Cluster: Group the categories below into clusters where each cluster represents ONE distinct topic, claim, or content type. Categories that describe the same web content using different headlines or framings belong in the same cluster. For example, a category about "product reviews" and a category about "consumer evaluations" likely belong together if they reflect the same underlying content type.
|
|
155
|
+
|
|
156
|
+
Step 2 — Label: For each cluster, choose the single label that best captures the shared meaning. {name_instruction}
|
|
157
|
+
|
|
158
|
+
Step 3 — Rank: Sum the frequency counts within each cluster. Output the top {max_categories} clusters by total count.
|
|
159
|
+
|
|
160
|
+
Categories (sorted by extraction frequency):
|
|
161
|
+
{seed_with_counts}
|
|
162
|
+
|
|
163
|
+
Return ONLY a numbered list of {max_categories} categories.
|
|
164
|
+
""".strip()
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
PROMPTS = {
|
|
168
|
+
"neutral": {
|
|
169
|
+
"first_pass": _NEUTRAL_FIRST_PASS,
|
|
170
|
+
"merge": _NEUTRAL_MERGE,
|
|
171
|
+
},
|
|
172
|
+
"survey": {
|
|
173
|
+
"first_pass": _SURVEY_FIRST_PASS,
|
|
174
|
+
"merge": _SURVEY_MERGE,
|
|
175
|
+
},
|
|
176
|
+
"social": {
|
|
177
|
+
# first_pass inherits from neutral
|
|
178
|
+
"merge": _SOCIAL_MERGE,
|
|
179
|
+
},
|
|
180
|
+
"academic": {
|
|
181
|
+
"merge": _ACADEMIC_MERGE,
|
|
182
|
+
},
|
|
183
|
+
"policy": {
|
|
184
|
+
"merge": _POLICY_MERGE,
|
|
185
|
+
},
|
|
186
|
+
"web": {
|
|
187
|
+
"merge": _WEB_MERGE,
|
|
188
|
+
},
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def get_prompt(domain: str, slot: str) -> str:
|
|
193
|
+
"""Look up a prompt slot for a domain, falling back to 'neutral'.
|
|
194
|
+
|
|
195
|
+
Args:
|
|
196
|
+
domain: A key in PROMPTS (e.g. "neutral", "survey", "social",
|
|
197
|
+
"academic", "policy", "web"). Unknown domains fall through
|
|
198
|
+
to neutral.
|
|
199
|
+
slot: "first_pass" or "merge".
|
|
200
|
+
|
|
201
|
+
Returns:
|
|
202
|
+
The template string, with f-string-style {placeholder} markers
|
|
203
|
+
that the caller fills via str.format(**kwargs).
|
|
204
|
+
"""
|
|
205
|
+
return PROMPTS.get(domain, {}).get(slot) or PROMPTS["neutral"][slot]
|
|
@@ -50,9 +50,9 @@ def _clean_label(label: str) -> str:
|
|
|
50
50
|
"Emotional Support: 3" -> "Emotional Support"
|
|
51
51
|
"emotional support" -> "emotional support"
|
|
52
52
|
"""
|
|
53
|
-
label = label.replace("**", "")
|
|
54
|
-
label = re.sub(r"\s*\(
|
|
55
|
-
label = re.sub(r"\s*:\s*\d+\s*$", "", label)
|
|
53
|
+
label = label.replace("**", "") # remove bold markers
|
|
54
|
+
label = re.sub(r"\s*\(\s*\d+\s*\)", "", label) # remove count-only parens like "(3)"
|
|
55
|
+
label = re.sub(r"\s*:\s*\d+\s*$", "", label) # remove trailing ": N" counts
|
|
56
56
|
return label.strip()
|
|
57
57
|
|
|
58
58
|
|
|
@@ -7,7 +7,7 @@ supporting both single-model and multi-model (ensemble) classification.
|
|
|
7
7
|
|
|
8
8
|
import math
|
|
9
9
|
import warnings
|
|
10
|
-
from typing import Union, Callable
|
|
10
|
+
from typing import Union, Callable, Optional
|
|
11
11
|
|
|
12
12
|
__all__ = [
|
|
13
13
|
# Main entry point
|
|
@@ -91,7 +91,7 @@ def classify(
|
|
|
91
91
|
auto_download: bool = False,
|
|
92
92
|
add_other = "prompt",
|
|
93
93
|
check_verbosity: bool = True,
|
|
94
|
-
json_formatter: bool =
|
|
94
|
+
json_formatter: Optional[bool] = None,
|
|
95
95
|
embeddings: bool = False,
|
|
96
96
|
category_descriptions: dict = None,
|
|
97
97
|
embedding_tiebreaker: bool = False,
|
|
@@ -532,19 +532,51 @@ def classify(
|
|
|
532
532
|
print()
|
|
533
533
|
|
|
534
534
|
# =========================================================================
|
|
535
|
-
# JSON formatter fallback
|
|
535
|
+
# JSON formatter fallback
|
|
536
536
|
# =========================================================================
|
|
537
|
+
# Auto-enable when Ollama (or any local model with colon-tag syntax) is in
|
|
538
|
+
# use, since small local models more often emit malformed classification
|
|
539
|
+
# JSON. Pass json_formatter=False explicitly to opt out.
|
|
540
|
+
def _uses_ollama_provider():
|
|
541
|
+
ms = (model_source or "").lower()
|
|
542
|
+
if ms == "ollama":
|
|
543
|
+
return True
|
|
544
|
+
if models:
|
|
545
|
+
for m in models:
|
|
546
|
+
provider = None
|
|
547
|
+
if isinstance(m, (list, tuple)) and len(m) >= 2:
|
|
548
|
+
provider = m[1]
|
|
549
|
+
elif isinstance(m, dict):
|
|
550
|
+
provider = m.get("provider")
|
|
551
|
+
if provider and str(provider).lower() == "ollama":
|
|
552
|
+
return True
|
|
553
|
+
return False
|
|
554
|
+
|
|
555
|
+
if json_formatter is None:
|
|
556
|
+
json_formatter = _uses_ollama_provider()
|
|
557
|
+
if json_formatter:
|
|
558
|
+
print(
|
|
559
|
+
"\n[CatLLM] Ollama detected — auto-enabling JSON formatter fallback\n"
|
|
560
|
+
" (small local models more often emit malformed JSON).\n"
|
|
561
|
+
" Pass json_formatter=False to opt out."
|
|
562
|
+
)
|
|
563
|
+
|
|
564
|
+
# The formatter MODEL is loaded lazily on the first parse failure (saves
|
|
565
|
+
# ~1 GB RAM + load time when no rows actually need rescuing). The dep
|
|
566
|
+
# check + cache verification still run upfront -- that's the fast part
|
|
567
|
+
# and lets us cleanly disable the formatter if deps can't be installed.
|
|
537
568
|
_formatter_state = None
|
|
538
569
|
if json_formatter:
|
|
539
570
|
try:
|
|
540
571
|
from ._formatter import ensure_formatter_available, load_formatter
|
|
541
572
|
|
|
542
573
|
if ensure_formatter_available():
|
|
543
|
-
fmt_model, fmt_tokenizer, fmt_device = load_formatter()
|
|
544
574
|
_formatter_state = {
|
|
545
|
-
"model": fmt_model,
|
|
546
|
-
"tokenizer": fmt_tokenizer,
|
|
547
|
-
"device": fmt_device,
|
|
575
|
+
"model": None,
|
|
576
|
+
"tokenizer": None,
|
|
577
|
+
"device": None,
|
|
578
|
+
"_loaded": False,
|
|
579
|
+
"_loader": load_formatter,
|
|
548
580
|
}
|
|
549
581
|
else:
|
|
550
582
|
json_formatter = False
|
|
@@ -34,6 +34,7 @@ def explore(
|
|
|
34
34
|
chunk_delay: float = 0.0,
|
|
35
35
|
auto_download: bool = False,
|
|
36
36
|
max_workers: int = 1,
|
|
37
|
+
domain: str = "neutral",
|
|
37
38
|
):
|
|
38
39
|
"""
|
|
39
40
|
Explore categories in text data, returning the raw extracted list.
|
|
@@ -107,6 +108,7 @@ def explore(
|
|
|
107
108
|
chunk_delay=chunk_delay,
|
|
108
109
|
auto_download=auto_download,
|
|
109
110
|
max_workers=max_workers,
|
|
111
|
+
domain=domain,
|
|
110
112
|
)
|
|
111
113
|
|
|
112
114
|
if filename:
|
|
@@ -60,6 +60,7 @@ def extract(
|
|
|
60
60
|
chunk_delay: float = 0.0,
|
|
61
61
|
auto_download: bool = False,
|
|
62
62
|
input_mode=None,
|
|
63
|
+
domain: str = "neutral",
|
|
63
64
|
):
|
|
64
65
|
"""
|
|
65
66
|
Unified category extraction function for text, image, and PDF inputs.
|
|
@@ -175,6 +176,7 @@ def extract(
|
|
|
175
176
|
progress_callback=progress_callback,
|
|
176
177
|
chunk_delay=chunk_delay,
|
|
177
178
|
auto_download=auto_download,
|
|
179
|
+
domain=domain,
|
|
178
180
|
)
|
|
179
181
|
|
|
180
182
|
elif input_type == "image":
|
|
@@ -73,6 +73,7 @@ from ._providers import (
|
|
|
73
73
|
OLLAMA_MODEL_SIZES,
|
|
74
74
|
)
|
|
75
75
|
from ._utils import _clean_label
|
|
76
|
+
from ._prompts import get_prompt
|
|
76
77
|
|
|
77
78
|
|
|
78
79
|
# =============================================================================
|
|
@@ -525,6 +526,7 @@ def explore_common_categories(
|
|
|
525
526
|
chunk_delay: float = 0.0,
|
|
526
527
|
auto_download: bool = False,
|
|
527
528
|
max_workers: int = 1,
|
|
529
|
+
domain: str = "neutral",
|
|
528
530
|
# Legacy parameter names for backward compatibility
|
|
529
531
|
user_model: str = None,
|
|
530
532
|
model_source: str = None,
|
|
@@ -687,13 +689,16 @@ def explore_common_categories(
|
|
|
687
689
|
else:
|
|
688
690
|
system_content = "You are a helpful assistant that extracts categories from text responses."
|
|
689
691
|
|
|
692
|
+
first_pass_template = get_prompt(domain, "first_pass")
|
|
693
|
+
|
|
690
694
|
def make_prompt(responses_blob: str) -> str:
|
|
691
695
|
focus_text = f" Focus specifically on {focus}." if focus else ""
|
|
692
|
-
return (
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
696
|
+
return first_pass_template.format(
|
|
697
|
+
categories_per_chunk=categories_per_chunk,
|
|
698
|
+
specificity=specificity,
|
|
699
|
+
context=survey_question,
|
|
700
|
+
focus_text=focus_text,
|
|
701
|
+
items_blob=responses_blob,
|
|
697
702
|
)
|
|
698
703
|
|
|
699
704
|
# Parse numbered list
|
|
@@ -849,31 +854,24 @@ def explore_common_categories(
|
|
|
849
854
|
|
|
850
855
|
if specificity == "specific":
|
|
851
856
|
name_instruction = (
|
|
852
|
-
"
|
|
853
|
-
"Each category name
|
|
854
|
-
"'such as' or parenthetical examples
|
|
857
|
+
"Use specific, descriptive labels. "
|
|
858
|
+
"Each category name MUST include a clarifying phrase using "
|
|
859
|
+
"'such as' or parenthetical examples."
|
|
855
860
|
)
|
|
856
861
|
else:
|
|
857
862
|
name_instruction = (
|
|
858
|
-
"Prefer specific, descriptive labels over vague ones."
|
|
863
|
+
"Prefer specific, descriptive labels over vague ones. "
|
|
864
|
+
"Each category name SHOULD include a brief clarifying phrase using "
|
|
865
|
+
"'such as' or parenthetical examples where helpful."
|
|
859
866
|
)
|
|
860
867
|
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
Step 2 — Label: For each cluster, choose the single label that best captures the shared meaning. {name_instruction}
|
|
869
|
-
|
|
870
|
-
Step 3 — Rank: Sum the frequency counts within each cluster. Output the top {max_categories} clusters by total count.
|
|
871
|
-
|
|
872
|
-
Categories (sorted by extraction frequency):
|
|
873
|
-
{seed_with_counts}
|
|
874
|
-
|
|
875
|
-
Return ONLY a numbered list of {max_categories} categories.
|
|
876
|
-
""".strip()
|
|
868
|
+
merge_template = get_prompt(domain, "merge")
|
|
869
|
+
second_prompt = merge_template.format(
|
|
870
|
+
context=survey_context,
|
|
871
|
+
max_categories=max_categories,
|
|
872
|
+
name_instruction=name_instruction,
|
|
873
|
+
seed_with_counts=seed_with_counts,
|
|
874
|
+
)
|
|
877
875
|
|
|
878
876
|
# Second pass call
|
|
879
877
|
reply2, error2 = client.complete(
|
|
@@ -2657,6 +2657,10 @@ Categorize text responses {cove_categorize}:
|
|
|
2657
2657
|
def _try_formatter_fallback(json_result, raw_reply, chunk_categories=None):
|
|
2658
2658
|
"""Try the JSON formatter if extract_json produced invalid output.
|
|
2659
2659
|
|
|
2660
|
+
Lazily loads the formatter model into RAM the first time this helper
|
|
2661
|
+
is invoked with a real failure -- saves ~1 GB RAM + load time when
|
|
2662
|
+
every row parses cleanly on the first try.
|
|
2663
|
+
|
|
2660
2664
|
Args:
|
|
2661
2665
|
chunk_categories: When called from chunked classification, the
|
|
2662
2666
|
actual chunk category list (not the full list). Needed so the
|
|
@@ -2669,6 +2673,19 @@ Categorize text responses {cove_categorize}:
|
|
|
2669
2673
|
is_valid, _ = validate_classification_json(json_result, n)
|
|
2670
2674
|
if is_valid:
|
|
2671
2675
|
return json_result
|
|
2676
|
+
|
|
2677
|
+
# Lazy load on first need
|
|
2678
|
+
if not formatter_state.get("_loaded"):
|
|
2679
|
+
print(
|
|
2680
|
+
"\n[CatLLM] First malformed-JSON row encountered -- loading\n"
|
|
2681
|
+
" JSON formatter model into RAM now (one-time per session)."
|
|
2682
|
+
)
|
|
2683
|
+
fmt_model, fmt_tokenizer, fmt_device = formatter_state["_loader"]()
|
|
2684
|
+
formatter_state["model"] = fmt_model
|
|
2685
|
+
formatter_state["tokenizer"] = fmt_tokenizer
|
|
2686
|
+
formatter_state["device"] = fmt_device
|
|
2687
|
+
formatter_state["_loaded"] = True
|
|
2688
|
+
|
|
2672
2689
|
from ._formatter import run_formatter
|
|
2673
2690
|
fixed_output = run_formatter(
|
|
2674
2691
|
raw_reply, cats,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/images/overlapping_pentagons.png
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|