cat-stack 1.6.8__tar.gz → 1.6.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cat_stack-1.6.8 → cat_stack-1.6.9}/.gitignore +8 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/PKG-INFO +1 -1
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/__about__.py +1 -1
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/_formatter.py +90 -8
- {cat_stack-1.6.8 → cat_stack-1.6.9}/LICENSE +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/README.md +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/pyproject.toml +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/cat_stack/__init__.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/__init__.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/_batch.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/_category_analysis.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/_chunked.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/_embeddings.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/_pilot_test.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/_prompts.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/_providers.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/_review_ui.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/_tiebreaker.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/_utils.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/_web_fetch.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/_wrapper_helpers.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/calls/CoVe.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/calls/__init__.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/calls/image_CoVe.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/calls/image_stepback.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/calls/pdf_CoVe.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/calls/pdf_stepback.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/calls/stepback.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/calls/top_n.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/classify.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/explore.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/extract.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/image_functions.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/images/circle.png +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/images/cube.png +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/images/diamond.png +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/images/overlapping_pentagons.png +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/images/rectangles.png +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/model_reference_list.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/pdf_functions.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/prompt_tune.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/summarize.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/text_functions.py +0 -0
- {cat_stack-1.6.8 → cat_stack-1.6.9}/src/catstack/text_functions_ensemble.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cat-stack
|
|
3
|
-
Version: 1.6.8
|
|
3
|
+
Version: 1.6.9
|
|
4
4
|
Summary: Domain-agnostic text, image, PDF, and DOCX classification engine powered by LLMs
|
|
5
5
|
Project-URL: Documentation, https://github.com/chrissoria/cat-stack#readme
|
|
6
6
|
Project-URL: Issues, https://github.com/chrissoria/cat-stack/issues
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
|
|
2
2
|
#
|
|
3
3
|
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
4
|
-
__version__ = "1.6.8"
|
|
4
|
+
__version__ = "1.6.9"
|
|
5
5
|
__author__ = "Chris Soria"
|
|
6
6
|
__email__ = "chrissoria@berkeley.edu"
|
|
7
7
|
__title__ = "cat-stack"
|
|
@@ -45,6 +45,11 @@ def _check_dependencies():
|
|
|
45
45
|
def _check_dependencies_installed() -> bool:
|
|
46
46
|
"""Pure check — returns True if all formatter deps import successfully.
|
|
47
47
|
No side effects, no install attempts."""
|
|
48
|
+
# If a dep was just pip-installed in this process's lifetime, the import
|
|
49
|
+
# system may have cached its earlier absence; clear that so the re-check
|
|
50
|
+
# actually sees the freshly-installed package.
|
|
51
|
+
import importlib
|
|
52
|
+
importlib.invalidate_caches()
|
|
48
53
|
try:
|
|
49
54
|
import torch # noqa: F401
|
|
50
55
|
import transformers # noqa: F401
|
|
@@ -165,7 +170,31 @@ def _ensure_dependencies(verbose: bool = True) -> bool:
|
|
|
165
170
|
" To skip this and disable the formatter, pass json_formatter=False."
|
|
166
171
|
)
|
|
167
172
|
|
|
168
|
-
|
|
173
|
+
ok = _install_dependencies(verbose=verbose)
|
|
174
|
+
if not ok:
|
|
175
|
+
# Freshly pip-installed packages (esp. compiled ones like torch) often
|
|
176
|
+
# cannot be imported by the SAME running process — but they ARE on disk
|
|
177
|
+
# now. Tell the user to re-run rather than silently degrading every row
|
|
178
|
+
# to an error.
|
|
179
|
+
if verbose and _deps_on_disk():
|
|
180
|
+
print(
|
|
181
|
+
"[CatLLM] Formatter dependencies were just installed but cannot "
|
|
182
|
+
"be imported into the already-running process. Please RE-RUN your "
|
|
183
|
+
"command — they will load on the next start. (Avoid this by "
|
|
184
|
+
"pre-installing: pip install 'cat-stack[formatter]'.)"
|
|
185
|
+
)
|
|
186
|
+
return ok
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _deps_on_disk() -> bool:
|
|
190
|
+
"""True if the formatter deps are findable on disk (importable in a FRESH
|
|
191
|
+
process) even if they failed to import in the current one."""
|
|
192
|
+
import importlib.util
|
|
193
|
+
try:
|
|
194
|
+
return all(importlib.util.find_spec(m) is not None
|
|
195
|
+
for m in ("torch", "transformers", "accelerate"))
|
|
196
|
+
except (ImportError, ValueError):
|
|
197
|
+
return False
|
|
169
198
|
|
|
170
199
|
|
|
171
200
|
def _is_model_cached() -> bool:
|
|
@@ -205,6 +234,51 @@ def ensure_formatter_available() -> bool:
|
|
|
205
234
|
return True # actual download happens in load_formatter()
|
|
206
235
|
|
|
207
236
|
|
|
237
|
+
def _load_formatter_tokenizer(AutoTokenizer):
|
|
238
|
+
"""Load the formatter tokenizer, defending against a malformed
|
|
239
|
+
`tokenizer_config.json`.
|
|
240
|
+
|
|
241
|
+
Some published configs store `extra_special_tokens` as a LIST, but
|
|
242
|
+
transformers 4.56–4.57.x expect a dict and crash in
|
|
243
|
+
`_set_model_specific_special_tokens` with
|
|
244
|
+
`'list' object has no attribute 'keys'`. On that failure we snapshot the
|
|
245
|
+
repo locally, normalize a list-valued `extra_special_tokens` to `{}`
|
|
246
|
+
(the tokens already live in `added_tokens`/`special_tokens_map`, so
|
|
247
|
+
dropping the field is lossless), and load from the patched local copy.
|
|
248
|
+
"""
|
|
249
|
+
try:
|
|
250
|
+
return AutoTokenizer.from_pretrained(
|
|
251
|
+
_MERGED_MODEL_REPO, trust_remote_code=True
|
|
252
|
+
)
|
|
253
|
+
except (AttributeError, TypeError) as e:
|
|
254
|
+
if "keys" not in str(e) and "extra_special_tokens" not in str(e):
|
|
255
|
+
raise
|
|
256
|
+
import json
|
|
257
|
+
import os
|
|
258
|
+
from huggingface_hub import snapshot_download
|
|
259
|
+
|
|
260
|
+
local_dir = snapshot_download(_MERGED_MODEL_REPO)
|
|
261
|
+
cfg_path = os.path.join(local_dir, "tokenizer_config.json")
|
|
262
|
+
with open(cfg_path) as f:
|
|
263
|
+
cfg = json.load(f)
|
|
264
|
+
if isinstance(cfg.get("extra_special_tokens"), list):
|
|
265
|
+
cfg["extra_special_tokens"] = {}
|
|
266
|
+
# snapshot dirs are often read-only symlink caches; patch a copy.
|
|
267
|
+
import tempfile
|
|
268
|
+
import shutil
|
|
269
|
+
patched = tempfile.mkdtemp(prefix="catllm_formatter_tok_")
|
|
270
|
+
for fn in os.listdir(local_dir):
|
|
271
|
+
src = os.path.join(local_dir, fn)
|
|
272
|
+
if os.path.isfile(src):
|
|
273
|
+
shutil.copy(src, os.path.join(patched, fn))
|
|
274
|
+
with open(os.path.join(patched, "tokenizer_config.json"), "w") as f:
|
|
275
|
+
json.dump(cfg, f)
|
|
276
|
+
print("[CatLLM] Patched malformed extra_special_tokens in the "
|
|
277
|
+
"formatter tokenizer config (list -> {}).")
|
|
278
|
+
return AutoTokenizer.from_pretrained(patched, trust_remote_code=True)
|
|
279
|
+
raise
|
|
280
|
+
|
|
281
|
+
|
|
208
282
|
def load_formatter(device=None):
|
|
209
283
|
"""
|
|
210
284
|
Load the merged formatter model and tokenizer.
|
|
@@ -230,15 +304,21 @@ def load_formatter(device=None):
|
|
|
230
304
|
dtype = torch.float16 if device == "cuda" else torch.float32
|
|
231
305
|
|
|
232
306
|
print(f"[CatLLM] Loading JSON formatter on {device}...")
|
|
233
|
-
tokenizer = AutoTokenizer.from_pretrained(
|
|
234
|
-
_MERGED_MODEL_REPO, trust_remote_code=True
|
|
235
|
-
)
|
|
307
|
+
tokenizer = _load_formatter_tokenizer(AutoTokenizer)
|
|
236
308
|
if tokenizer.pad_token is None:
|
|
237
309
|
tokenizer.pad_token = tokenizer.eos_token
|
|
238
310
|
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
311
|
+
# `dtype=` is the transformers >=4.56 kwarg; older versions only accept
|
|
312
|
+
# `torch_dtype=` and crash if `dtype=` leaks into the config. Try the new
|
|
313
|
+
# name, fall back to the old one.
|
|
314
|
+
try:
|
|
315
|
+
model = AutoModelForCausalLM.from_pretrained(
|
|
316
|
+
_MERGED_MODEL_REPO, dtype=dtype, trust_remote_code=True
|
|
317
|
+
)
|
|
318
|
+
except TypeError:
|
|
319
|
+
model = AutoModelForCausalLM.from_pretrained(
|
|
320
|
+
_MERGED_MODEL_REPO, torch_dtype=dtype, trust_remote_code=True
|
|
321
|
+
)
|
|
242
322
|
model = model.to(device)
|
|
243
323
|
model.eval()
|
|
244
324
|
|
|
@@ -281,7 +361,9 @@ def run_formatter(raw_output, categories, model, tokenizer, device):
|
|
|
281
361
|
with torch.no_grad():
|
|
282
362
|
out = model.generate(
|
|
283
363
|
**inputs,
|
|
284
|
-
max_new_tokens=128,
|
|
364
|
+
# 512 (was 128): a large category set produces a long N-key JSON
|
|
365
|
+
# object; 128 tokens truncated it for 28/48-category tasks.
|
|
366
|
+
max_new_tokens=512,
|
|
285
367
|
do_sample=False,
|
|
286
368
|
temperature=None,
|
|
287
369
|
top_p=None,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|