cat-stack 1.0.18__tar.gz → 1.0.22__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. {cat_stack-1.0.18 → cat_stack-1.0.22}/PKG-INFO +2 -2
  2. {cat_stack-1.0.18 → cat_stack-1.0.22}/README.md +1 -1
  3. {cat_stack-1.0.18 → cat_stack-1.0.22}/pyproject.toml +12 -10
  4. cat_stack-1.0.22/src/cat_stack/__init__.py +18 -0
  5. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/__about__.py +1 -1
  6. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/_formatter.py +62 -14
  7. cat_stack-1.0.22/src/catstack/_prompts.py +205 -0
  8. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/_utils.py +3 -3
  9. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/classify.py +39 -7
  10. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/explore.py +2 -0
  11. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/extract.py +2 -0
  12. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/text_functions.py +23 -25
  13. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/text_functions_ensemble.py +17 -0
  14. {cat_stack-1.0.18 → cat_stack-1.0.22}/.gitignore +0 -0
  15. {cat_stack-1.0.18 → cat_stack-1.0.22}/LICENSE +0 -0
  16. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/__init__.py +0 -0
  17. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/_batch.py +0 -0
  18. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/_category_analysis.py +0 -0
  19. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/_chunked.py +0 -0
  20. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/_embeddings.py +0 -0
  21. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/_pilot_test.py +0 -0
  22. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/_providers.py +0 -0
  23. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/_review_ui.py +0 -0
  24. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/_tiebreaker.py +0 -0
  25. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/_web_fetch.py +0 -0
  26. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/calls/CoVe.py +0 -0
  27. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/calls/__init__.py +0 -0
  28. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/calls/all_calls.py +0 -0
  29. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/calls/image_CoVe.py +0 -0
  30. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/calls/image_stepback.py +0 -0
  31. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/calls/pdf_CoVe.py +0 -0
  32. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/calls/pdf_stepback.py +0 -0
  33. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/calls/stepback.py +0 -0
  34. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/calls/top_n.py +0 -0
  35. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/image_functions.py +0 -0
  36. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/images/circle.png +0 -0
  37. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/images/cube.png +0 -0
  38. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/images/diamond.png +0 -0
  39. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/images/overlapping_pentagons.png +0 -0
  40. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/images/rectangles.png +0 -0
  41. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/model_reference_list.py +0 -0
  42. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/pdf_functions.py +0 -0
  43. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/prompt_tune.py +0 -0
  44. {cat_stack-1.0.18/src/cat_stack → cat_stack-1.0.22/src/catstack}/summarize.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cat-stack
3
- Version: 1.0.18
3
+ Version: 1.0.22
4
4
  Summary: Domain-agnostic text, image, PDF, and DOCX classification engine powered by LLMs
5
5
  Project-URL: Documentation, https://github.com/chrissoria/cat-stack#readme
6
6
  Project-URL: Issues, https://github.com/chrissoria/cat-stack/issues
@@ -73,7 +73,7 @@ Installing `cat-llm` pulls in all of the above.
73
73
  ## Quick Start
74
74
 
75
75
  ```python
76
- import cat_stack as cat
76
+ import catstack as cat
77
77
 
78
78
  # Classify text into predefined categories
79
79
  result = cat.classify(
@@ -36,7 +36,7 @@ Installing `cat-llm` pulls in all of the above.
36
36
  ## Quick Start
37
37
 
38
38
  ```python
39
- import cat_stack as cat
39
+ import catstack as cat
40
40
 
41
41
  # Classify text into predefined categories
42
42
  result = cat.classify(
@@ -43,38 +43,40 @@ Issues = "https://github.com/chrissoria/cat-stack/issues"
43
43
  Source = "https://github.com/chrissoria/cat-stack"
44
44
 
45
45
  [tool.hatch.version]
46
- path = "src/cat_stack/__about__.py"
46
+ path = "src/catstack/__about__.py"
47
47
 
48
48
  [tool.hatch.envs.types]
49
49
  extra-dependencies = [
50
50
  "mypy>=1.0.0",
51
51
  ]
52
52
  [tool.hatch.envs.types.scripts]
53
- check = "mypy --install-types --non-interactive {args:src/cat_stack tests}"
53
+ check = "mypy --install-types --non-interactive {args:src/catstack tests}"
54
54
 
55
55
  [tool.hatch.build.targets.wheel]
56
- packages = ["src/cat_stack"]
56
+ packages = ["src/catstack", "src/cat_stack"]
57
57
  include = [
58
- "src/cat_stack/**/*.py",
59
- "src/cat_stack/images/*",
58
+ "src/catstack/**/*.py",
59
+ "src/catstack/images/*",
60
+ "src/cat_stack/__init__.py",
60
61
  ]
61
62
 
62
63
  [tool.hatch.build.targets.sdist]
63
64
  include = [
64
- "src/cat_stack/**/*.py",
65
- "src/cat_stack/images/*",
65
+ "src/catstack/**/*.py",
66
+ "src/catstack/images/*",
67
+ "src/cat_stack/__init__.py",
66
68
  ]
67
69
 
68
70
  [tool.coverage.run]
69
- source_pkgs = ["cat_stack", "tests"]
71
+ source_pkgs = ["catstack", "tests"]
70
72
  branch = true
71
73
  parallel = true
72
74
  omit = [
73
- "src/cat_stack/__about__.py",
75
+ "src/catstack/__about__.py",
74
76
  ]
75
77
 
76
78
  [tool.coverage.paths]
77
- cat_stack = ["src/cat_stack", "*/cat-stack/src/cat_stack"]
79
+ catstack = ["src/catstack", "*/cat-stack/src/catstack"]
78
80
  tests = ["tests", "*/cat-stack/tests"]
79
81
 
80
82
  [tool.coverage.report]
@@ -0,0 +1,18 @@
1
+ """Back-compat alias for `catstack`.
2
+
3
+ The canonical import name is `catstack`. `cat_stack` is retained so existing
4
+ code continues to work; prefer `catstack` in new code.
5
+ """
6
+ import importlib
7
+ import sys
8
+
9
+ _canonical = "catstack"
10
+ _real = importlib.import_module(_canonical)
11
+
12
+ sys.modules[__name__] = _real
13
+
14
+ _src_prefix = _canonical + "."
15
+ _dst_prefix = __name__ + "."
16
+ for _name in list(sys.modules):
17
+ if _name.startswith(_src_prefix):
18
+ sys.modules[_dst_prefix + _name[len(_src_prefix):]] = sys.modules[_name]
@@ -1,7 +1,7 @@
1
1
  # SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
2
2
  #
3
3
  # SPDX-License-Identifier: GPL-3.0-or-later
4
- __version__ = "1.0.18"
4
+ __version__ = "1.0.22"
5
5
  __author__ = "Chris Soria"
6
6
  __email__ = "chrissoria@berkeley.edu"
7
7
  __title__ = "cat-stack"
@@ -42,6 +42,56 @@ def _check_dependencies():
42
42
  )
43
43
 
44
44
 
45
+ def _ensure_dependencies(verbose: bool = True) -> bool:
46
+ """Ensure formatter Python dependencies are installed.
47
+
48
+ Tries to import torch/transformers/accelerate. If any are missing,
49
+ auto-installs them via pip after printing a clear warning about the
50
+ download size (~1.5 GB total). Returns True on success, False on
51
+ install failure.
52
+ """
53
+ try:
54
+ import torch # noqa: F401
55
+ import transformers # noqa: F401
56
+ import accelerate # noqa: F401
57
+ return True
58
+ except ImportError:
59
+ pass
60
+
61
+ if verbose:
62
+ print(
63
+ "\n[CatLLM] JSON formatter dependencies (transformers, torch, "
64
+ "accelerate)\n"
65
+ " are not installed in this Python environment. Installing now\n"
66
+ " (~1.5 GB download; one-time). To skip this and disable the\n"
67
+ " formatter, pass json_formatter=False."
68
+ )
69
+
70
+ import subprocess
71
+ try:
72
+ subprocess.check_call(
73
+ [sys.executable, "-m", "pip", "install", "--quiet",
74
+ "transformers", "torch", "accelerate", "sentencepiece"]
75
+ )
76
+ except subprocess.CalledProcessError as e:
77
+ if verbose:
78
+ print(
79
+ f"[CatLLM] Failed to install formatter dependencies ({e}).\n"
80
+ " Install manually: pip install 'cat-llm[formatter]'"
81
+ )
82
+ return False
83
+
84
+ # Verify import works now
85
+ try:
86
+ import torch # noqa: F401
87
+ import transformers # noqa: F401
88
+ return True
89
+ except ImportError as e:
90
+ if verbose:
91
+ print(f"[CatLLM] Formatter deps installed but import failed: {e}")
92
+ return False
93
+
94
+
45
95
  def _is_model_cached() -> bool:
46
96
  """Check if the merged model is already in the HuggingFace cache."""
47
97
  try:
@@ -54,31 +104,29 @@ def _is_model_cached() -> bool:
54
104
 
55
105
  def ensure_formatter_available() -> bool:
56
106
  """
57
- Ensure the formatter model is available, prompting to download if needed.
107
+ Ensure the formatter model and its Python dependencies are available.
108
+
109
+ Auto-installs deps (transformers/torch/accelerate, ~1.5 GB) on first use
110
+ and auto-downloads the formatter model (~1 GB) from HuggingFace on first
111
+ use. Both events print a clear warning to the console; neither prompts
112
+ interactively, so this function is safe to call from Rscript / non-TTY
113
+ sessions.
58
114
 
59
115
  Returns:
60
- True if the formatter is ready to use, False if user declined download.
116
+ True if the formatter is ready to use, False on install failure.
61
117
  """
62
- _check_dependencies()
118
+ if not _ensure_dependencies():
119
+ return False
63
120
 
64
121
  if _is_model_cached():
65
122
  return True
66
123
 
67
124
  print(
68
- "\n[CatLLM] The JSON formatter model (~1GB) will be downloaded from\n"
125
+ "\n[CatLLM] Downloading JSON formatter model (~1 GB) from\n"
69
126
  f" HuggingFace Hub ({_MERGED_MODEL_REPO}).\n"
70
127
  " This is a one-time download — the model is cached locally after."
71
128
  )
72
- try:
73
- answer = input(" Continue? (Y/n): ").strip().lower()
74
- except (EOFError, KeyboardInterrupt):
75
- answer = "n"
76
-
77
- if answer in ("", "y", "yes"):
78
- return True
79
- else:
80
- print(" -> JSON formatter disabled for this run.\n")
81
- return False
129
+ return True # actual download happens in load_formatter()
82
130
 
83
131
 
84
132
  def load_formatter(device=None):
@@ -0,0 +1,205 @@
1
+ """
2
+ Domain-keyed prompt registry.
3
+
4
+ cat-stack's extract() and explore() pipelines use two LLM prompts: a
5
+ *first-pass* per-chunk extraction prompt and a *second-pass* semantic
6
+ *merge* prompt. The wording of each is domain-shaped — survey responses
7
+ read differently than social-media posts or academic papers.
8
+
9
+ This module centralises every variant in one place. Domain-specific
10
+ sub-packages (cat-survey, cat-vader, cat-ademic, cat-pol, cat-web) call
11
+ catstack.extract/explore with `domain="<key>"` to select the appropriate
12
+ variant. The default is `"neutral"`, which contains no domain-shaped
13
+ language so direct catstack callers get generic prompts.
14
+
15
+ A domain only needs to override the slots that genuinely differ from
16
+ neutral; unspecified slots fall back to neutral via `get_prompt`.
17
+
18
+ Template placeholders:
19
+
20
+ first_pass — {categories_per_chunk} {specificity} {context}
21
+ {focus_text} {items_blob}
22
+ merge — {context} {max_categories} {name_instruction}
23
+ {seed_with_counts}
24
+ """
25
+
26
+ # Generic, domain-neutral templates. Used directly when the caller does
27
+ # not pass a domain, and used as the fallback for any slot a domain does
28
+ # not override.
29
+ _NEUTRAL_FIRST_PASS = (
30
+ 'Identify {categories_per_chunk} {specificity} categories present in '
31
+ 'the following texts about: "{context}".{focus_text} '
32
+ "Items are separated by semicolons. "
33
+ "Items are within triple backticks: ```{items_blob}``` "
34
+ "Number your categories from 1 through {categories_per_chunk} and "
35
+ "provide concise labels only (no descriptions)."
36
+ )
37
+
38
+ _NEUTRAL_MERGE = """
39
+ You are consolidating categories extracted from a collection of texts about: "{context}"
40
+
41
+ Task: Reduce to {max_categories} categories.
42
+
43
+ Step 1 — Cluster: Group the categories below into clusters where each cluster represents ONE distinct concept or theme. Categories that describe the same concept using different words or from different angles belong in the same cluster. For example, a category about "battery life" and a category about "charge duration" likely belong together if they reflect the same underlying concept.
44
+
45
+ Step 2 — Label: For each cluster, choose the single label that best captures the shared meaning. {name_instruction}
46
+
47
+ Step 3 — Rank: Sum the frequency counts within each cluster. Output the top {max_categories} clusters by total count.
48
+
49
+ Categories (sorted by extraction frequency):
50
+ {seed_with_counts}
51
+
52
+ Return ONLY a numbered list of {max_categories} categories. Each line must follow this exact format:
53
+ N. Category Label (such as example 1, example 2, example 3)
54
+
55
+ Example:
56
+ 1. Financial Pressures (such as rising costs, budget constraints, or loss of income)
57
+ 2. Location or Environment (such as moving to a new city, neighborhood quality, or proximity to amenities)
58
+ """.strip()
59
+
60
+
61
+ # Survey: the historical cat-stack prompt, verbatim. "respondent" /
62
+ # "reason" language preserved.
63
+ _SURVEY_FIRST_PASS = (
64
+ 'Identify {categories_per_chunk} {specificity} categories of responses '
65
+ 'to the question "{context}" in the following list of responses.{focus_text} '
66
+ "Responses are separated by semicolons. "
67
+ "Responses are within triple backticks: ```{items_blob}``` "
68
+ "Number your categories from 1 through {categories_per_chunk} and "
69
+ "provide concise labels only (no descriptions)."
70
+ )
71
+
72
+ _SURVEY_MERGE = """
73
+ You are consolidating categories extracted from survey responses to: "{context}"
74
+
75
+ Task: Reduce to {max_categories} categories.
76
+
77
+ Step 1 — Cluster: Group the categories below into clusters where each cluster represents ONE distinct reason a respondent might give. Categories that describe the same reason using different words or from different angles belong in the same cluster. For example, a category about relationship quality and a category about emotional closeness likely belong together if they reflect the same underlying reason.
78
+
79
+ Step 2 — Label: For each cluster, choose the single label that best captures the shared meaning. {name_instruction}
80
+
81
+ Step 3 — Rank: Sum the frequency counts within each cluster. Output the top {max_categories} clusters by total count.
82
+
83
+ Categories (sorted by extraction frequency):
84
+ {seed_with_counts}
85
+
86
+ Return ONLY a numbered list of {max_categories} categories. Each line must follow this exact format:
87
+ N. Category Label (such as example 1, example 2, example 3)
88
+
89
+ Example:
90
+ 1. Financial Pressures (such as rising rent, job loss, or inability to afford housing)
91
+ 2. Proximity to Family (such as moving closer to parents, children, or extended relatives)
92
+ """.strip()
93
+
94
+
95
+ _SOCIAL_MERGE = """
96
+ You are consolidating categories extracted from social-media posts about: "{context}"
97
+
98
+ Task: Reduce to {max_categories} categories.
99
+
100
+ Step 1 — Cluster: Group the categories below into clusters where each cluster represents ONE distinct topic, sentiment, or behaviour expressed in the posts. Categories that describe the same underlying message using different wording, slang, or hashtags belong in the same cluster. For example, a category about "product praise" and a category about "positive recommendation" likely belong together if they reflect the same underlying sentiment.
101
+
102
+ Step 2 — Label: For each cluster, choose the single label that best captures the shared meaning. {name_instruction}
103
+
104
+ Step 3 — Rank: Sum the frequency counts within each cluster. Output the top {max_categories} clusters by total count.
105
+
106
+ Categories (sorted by extraction frequency):
107
+ {seed_with_counts}
108
+
109
+ Return ONLY a numbered list of {max_categories} categories.
110
+ """.strip()
111
+
112
+
113
+ _ACADEMIC_MERGE = """
114
+ You are consolidating categories extracted from academic texts about: "{context}"
115
+
116
+ Task: Reduce to {max_categories} categories.
117
+
118
+ Step 1 — Cluster: Group the categories below into clusters where each cluster represents ONE distinct research theme, method, or finding. Categories that describe the same scholarly concept using different terminology or framings belong in the same cluster. For example, a category about "longitudinal cohort analysis" and a category about "panel data study design" likely belong together if they reflect the same underlying research approach.
119
+
120
+ Step 2 — Label: For each cluster, choose the single label that best captures the shared meaning. {name_instruction}
121
+
122
+ Step 3 — Rank: Sum the frequency counts within each cluster. Output the top {max_categories} clusters by total count.
123
+
124
+ Categories (sorted by extraction frequency):
125
+ {seed_with_counts}
126
+
127
+ Return ONLY a numbered list of {max_categories} categories.
128
+ """.strip()
129
+
130
+
131
+ _POLICY_MERGE = """
132
+ You are consolidating categories extracted from policy documents about: "{context}"
133
+
134
+ Task: Reduce to {max_categories} categories.
135
+
136
+ Step 1 — Cluster: Group the categories below into clusters where each cluster represents ONE distinct policy area, provision, or government action. Categories that describe the same provision using different statutory language or framings belong in the same cluster. For example, a category about "Medicaid eligibility expansion" and a category about "low-income healthcare coverage extension" likely belong together if they reflect the same underlying policy mechanism.
137
+
138
+ Step 2 — Label: For each cluster, choose the single label that best captures the policy area or provision. {name_instruction}
139
+
140
+ Step 3 — Rank: Sum the frequency counts within each cluster. Output the top {max_categories} clusters by total count.
141
+
142
+ Categories (sorted by extraction frequency):
143
+ {seed_with_counts}
144
+
145
+ Return ONLY a numbered list of {max_categories} categories.
146
+ """.strip()
147
+
148
+
149
+ _WEB_MERGE = """
150
+ You are consolidating categories extracted from web content about: "{context}"
151
+
152
+ Task: Reduce to {max_categories} categories.
153
+
154
+ Step 1 — Cluster: Group the categories below into clusters where each cluster represents ONE distinct topic, claim, or content type. Categories that describe the same web content using different headlines or framings belong in the same cluster. For example, a category about "product reviews" and a category about "consumer evaluations" likely belong together if they reflect the same underlying content type.
155
+
156
+ Step 2 — Label: For each cluster, choose the single label that best captures the shared meaning. {name_instruction}
157
+
158
+ Step 3 — Rank: Sum the frequency counts within each cluster. Output the top {max_categories} clusters by total count.
159
+
160
+ Categories (sorted by extraction frequency):
161
+ {seed_with_counts}
162
+
163
+ Return ONLY a numbered list of {max_categories} categories.
164
+ """.strip()
165
+
166
+
167
+ PROMPTS = {
168
+ "neutral": {
169
+ "first_pass": _NEUTRAL_FIRST_PASS,
170
+ "merge": _NEUTRAL_MERGE,
171
+ },
172
+ "survey": {
173
+ "first_pass": _SURVEY_FIRST_PASS,
174
+ "merge": _SURVEY_MERGE,
175
+ },
176
+ "social": {
177
+ # first_pass inherits from neutral
178
+ "merge": _SOCIAL_MERGE,
179
+ },
180
+ "academic": {
181
+ "merge": _ACADEMIC_MERGE,
182
+ },
183
+ "policy": {
184
+ "merge": _POLICY_MERGE,
185
+ },
186
+ "web": {
187
+ "merge": _WEB_MERGE,
188
+ },
189
+ }
190
+
191
+
192
+ def get_prompt(domain: str, slot: str) -> str:
193
+ """Look up a prompt slot for a domain, falling back to 'neutral'.
194
+
195
+ Args:
196
+ domain: A key in PROMPTS (e.g. "neutral", "survey", "social",
197
+ "academic", "policy", "web"). Unknown domains fall through
198
+ to neutral.
199
+ slot: "first_pass" or "merge".
200
+
201
+ Returns:
202
+ The template string, with f-string-style {placeholder} markers
203
+ that the caller fills via str.format(**kwargs).
204
+ """
205
+ return PROMPTS.get(domain, {}).get(slot) or PROMPTS["neutral"][slot]
@@ -50,9 +50,9 @@ def _clean_label(label: str) -> str:
50
50
  "Emotional Support: 3" -> "Emotional Support"
51
51
  "emotional support" -> "emotional support"
52
52
  """
53
- label = label.replace("**", "") # remove bold markers
54
- label = re.sub(r"\s*\([^)]*\)", "", label) # remove parenthetical notes
55
- label = re.sub(r"\s*:\s*\d+\s*$", "", label) # remove trailing ": N" counts
53
+ label = label.replace("**", "") # remove bold markers
54
+ label = re.sub(r"\s*\(\s*\d+\s*\)", "", label) # remove count-only parens like "(3)"
55
+ label = re.sub(r"\s*:\s*\d+\s*$", "", label) # remove trailing ": N" counts
56
56
  return label.strip()
57
57
 
58
58
 
@@ -7,7 +7,7 @@ supporting both single-model and multi-model (ensemble) classification.
7
7
 
8
8
  import math
9
9
  import warnings
10
- from typing import Union, Callable
10
+ from typing import Union, Callable, Optional
11
11
 
12
12
  __all__ = [
13
13
  # Main entry point
@@ -91,7 +91,7 @@ def classify(
91
91
  auto_download: bool = False,
92
92
  add_other = "prompt",
93
93
  check_verbosity: bool = True,
94
- json_formatter: bool = False,
94
+ json_formatter: Optional[bool] = None,
95
95
  embeddings: bool = False,
96
96
  category_descriptions: dict = None,
97
97
  embedding_tiebreaker: bool = False,
@@ -532,19 +532,51 @@ def classify(
532
532
  print()
533
533
 
534
534
  # =========================================================================
535
- # JSON formatter fallback (opt-in)
535
+ # JSON formatter fallback
536
536
  # =========================================================================
537
+ # Auto-enable when Ollama (or any local model with colon-tag syntax) is in
538
+ # use, since small local models more often emit malformed classification
539
+ # JSON. Pass json_formatter=False explicitly to opt out.
540
+ def _uses_ollama_provider():
541
+ ms = (model_source or "").lower()
542
+ if ms == "ollama":
543
+ return True
544
+ if models:
545
+ for m in models:
546
+ provider = None
547
+ if isinstance(m, (list, tuple)) and len(m) >= 2:
548
+ provider = m[1]
549
+ elif isinstance(m, dict):
550
+ provider = m.get("provider")
551
+ if provider and str(provider).lower() == "ollama":
552
+ return True
553
+ return False
554
+
555
+ if json_formatter is None:
556
+ json_formatter = _uses_ollama_provider()
557
+ if json_formatter:
558
+ print(
559
+ "\n[CatLLM] Ollama detected — auto-enabling JSON formatter fallback\n"
560
+ " (small local models more often emit malformed JSON).\n"
561
+ " Pass json_formatter=False to opt out."
562
+ )
563
+
564
+ # The formatter MODEL is loaded lazily on the first parse failure (saves
565
+ # ~1 GB RAM + load time when no rows actually need rescuing). The dep
566
+ # check + cache verification still run upfront -- that's the fast part
567
+ # and lets us cleanly disable the formatter if deps can't be installed.
537
568
  _formatter_state = None
538
569
  if json_formatter:
539
570
  try:
540
571
  from ._formatter import ensure_formatter_available, load_formatter
541
572
 
542
573
  if ensure_formatter_available():
543
- fmt_model, fmt_tokenizer, fmt_device = load_formatter()
544
574
  _formatter_state = {
545
- "model": fmt_model,
546
- "tokenizer": fmt_tokenizer,
547
- "device": fmt_device,
575
+ "model": None,
576
+ "tokenizer": None,
577
+ "device": None,
578
+ "_loaded": False,
579
+ "_loader": load_formatter,
548
580
  }
549
581
  else:
550
582
  json_formatter = False
@@ -34,6 +34,7 @@ def explore(
34
34
  chunk_delay: float = 0.0,
35
35
  auto_download: bool = False,
36
36
  max_workers: int = 1,
37
+ domain: str = "neutral",
37
38
  ):
38
39
  """
39
40
  Explore categories in text data, returning the raw extracted list.
@@ -107,6 +108,7 @@ def explore(
107
108
  chunk_delay=chunk_delay,
108
109
  auto_download=auto_download,
109
110
  max_workers=max_workers,
111
+ domain=domain,
110
112
  )
111
113
 
112
114
  if filename:
@@ -60,6 +60,7 @@ def extract(
60
60
  chunk_delay: float = 0.0,
61
61
  auto_download: bool = False,
62
62
  input_mode=None,
63
+ domain: str = "neutral",
63
64
  ):
64
65
  """
65
66
  Unified category extraction function for text, image, and PDF inputs.
@@ -175,6 +176,7 @@ def extract(
175
176
  progress_callback=progress_callback,
176
177
  chunk_delay=chunk_delay,
177
178
  auto_download=auto_download,
179
+ domain=domain,
178
180
  )
179
181
 
180
182
  elif input_type == "image":
@@ -73,6 +73,7 @@ from ._providers import (
73
73
  OLLAMA_MODEL_SIZES,
74
74
  )
75
75
  from ._utils import _clean_label
76
+ from ._prompts import get_prompt
76
77
 
77
78
 
78
79
  # =============================================================================
@@ -525,6 +526,7 @@ def explore_common_categories(
525
526
  chunk_delay: float = 0.0,
526
527
  auto_download: bool = False,
527
528
  max_workers: int = 1,
529
+ domain: str = "neutral",
528
530
  # Legacy parameter names for backward compatibility
529
531
  user_model: str = None,
530
532
  model_source: str = None,
@@ -687,13 +689,16 @@ def explore_common_categories(
687
689
  else:
688
690
  system_content = "You are a helpful assistant that extracts categories from text responses."
689
691
 
692
+ first_pass_template = get_prompt(domain, "first_pass")
693
+
690
694
  def make_prompt(responses_blob: str) -> str:
691
695
  focus_text = f" Focus specifically on {focus}." if focus else ""
692
- return (
693
- f'Identify {categories_per_chunk} {specificity} categories of responses to the question "{survey_question}" '
694
- f"in the following list of responses.{focus_text} Responses are separated by semicolons. "
695
- f"Responses are within triple backticks: ```{responses_blob}``` "
696
- f"Number your categories from 1 through {categories_per_chunk} and provide concise labels only (no descriptions)."
696
+ return first_pass_template.format(
697
+ categories_per_chunk=categories_per_chunk,
698
+ specificity=specificity,
699
+ context=survey_question,
700
+ focus_text=focus_text,
701
+ items_blob=responses_blob,
697
702
  )
698
703
 
699
704
  # Parse numbered list
@@ -849,31 +854,24 @@ def explore_common_categories(
849
854
 
850
855
  if specificity == "specific":
851
856
  name_instruction = (
852
- "Prefer specific, descriptive labels over vague ones. "
853
- "Each category name SHOULD include a brief clarifying phrase using "
854
- "'such as' or parenthetical examples where helpful."
857
+ "Use specific, descriptive labels. "
858
+ "Each category name MUST include a clarifying phrase using "
859
+ "'such as' or parenthetical examples."
855
860
  )
856
861
  else:
857
862
  name_instruction = (
858
- "Prefer specific, descriptive labels over vague ones."
863
+ "Prefer specific, descriptive labels over vague ones. "
864
+ "Each category name SHOULD include a brief clarifying phrase using "
865
+ "'such as' or parenthetical examples where helpful."
859
866
  )
860
867
 
861
- second_prompt = f"""
862
- You are consolidating categories extracted from survey responses to: "{survey_context}"
863
-
864
- Task: Reduce to {max_categories} categories.
865
-
866
- Step 1 — Cluster: Group the categories below into clusters where each cluster represents ONE distinct reason a respondent might give. Categories that describe the same reason using different words or from different angles belong in the same cluster. For example, a category about relationship quality and a category about emotional closeness likely belong together if they reflect the same underlying reason.
867
-
868
- Step 2 — Label: For each cluster, choose the single label that best captures the shared meaning. {name_instruction}
869
-
870
- Step 3 — Rank: Sum the frequency counts within each cluster. Output the top {max_categories} clusters by total count.
871
-
872
- Categories (sorted by extraction frequency):
873
- {seed_with_counts}
874
-
875
- Return ONLY a numbered list of {max_categories} categories.
876
- """.strip()
868
+ merge_template = get_prompt(domain, "merge")
869
+ second_prompt = merge_template.format(
870
+ context=survey_context,
871
+ max_categories=max_categories,
872
+ name_instruction=name_instruction,
873
+ seed_with_counts=seed_with_counts,
874
+ )
877
875
 
878
876
  # Second pass call
879
877
  reply2, error2 = client.complete(
@@ -2657,6 +2657,10 @@ Categorize text responses {cove_categorize}:
2657
2657
  def _try_formatter_fallback(json_result, raw_reply, chunk_categories=None):
2658
2658
  """Try the JSON formatter if extract_json produced invalid output.
2659
2659
 
2660
+ Lazily loads the formatter model into RAM the first time this helper
2661
+ is invoked with a real failure -- saves ~1 GB RAM + load time when
2662
+ every row parses cleanly on the first try.
2663
+
2660
2664
  Args:
2661
2665
  chunk_categories: When called from chunked classification, the
2662
2666
  actual chunk category list (not the full list). Needed so the
@@ -2669,6 +2673,19 @@ Categorize text responses {cove_categorize}:
2669
2673
  is_valid, _ = validate_classification_json(json_result, n)
2670
2674
  if is_valid:
2671
2675
  return json_result
2676
+
2677
+ # Lazy load on first need
2678
+ if not formatter_state.get("_loaded"):
2679
+ print(
2680
+ "\n[CatLLM] First malformed-JSON row encountered -- loading\n"
2681
+ " JSON formatter model into RAM now (one-time per session)."
2682
+ )
2683
+ fmt_model, fmt_tokenizer, fmt_device = formatter_state["_loader"]()
2684
+ formatter_state["model"] = fmt_model
2685
+ formatter_state["tokenizer"] = fmt_tokenizer
2686
+ formatter_state["device"] = fmt_device
2687
+ formatter_state["_loaded"] = True
2688
+
2672
2689
  from ._formatter import run_formatter
2673
2690
  fixed_output = run_formatter(
2674
2691
  raw_reply, cats,
File without changes
File without changes