cat-stack 1.2.0__tar.gz → 1.4.0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {cat_stack-1.2.0 → cat_stack-1.4.0}/PKG-INFO +1 -1
  2. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/__about__.py +1 -1
  3. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/__init__.py +2 -0
  4. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/_wrapper_helpers.py +98 -0
  5. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/prompt_tune.py +8 -8
  6. {cat_stack-1.2.0 → cat_stack-1.4.0}/.gitignore +0 -0
  7. {cat_stack-1.2.0 → cat_stack-1.4.0}/LICENSE +0 -0
  8. {cat_stack-1.2.0 → cat_stack-1.4.0}/README.md +0 -0
  9. {cat_stack-1.2.0 → cat_stack-1.4.0}/pyproject.toml +0 -0
  10. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/cat_stack/__init__.py +0 -0
  11. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/_batch.py +0 -0
  12. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/_category_analysis.py +0 -0
  13. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/_chunked.py +0 -0
  14. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/_embeddings.py +0 -0
  15. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/_formatter.py +0 -0
  16. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/_pilot_test.py +0 -0
  17. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/_prompts.py +0 -0
  18. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/_providers.py +0 -0
  19. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/_review_ui.py +0 -0
  20. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/_tiebreaker.py +0 -0
  21. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/_utils.py +0 -0
  22. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/_web_fetch.py +0 -0
  23. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/calls/CoVe.py +0 -0
  24. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/calls/__init__.py +0 -0
  25. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/calls/all_calls.py +0 -0
  26. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/calls/image_CoVe.py +0 -0
  27. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/calls/image_stepback.py +0 -0
  28. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/calls/pdf_CoVe.py +0 -0
  29. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/calls/pdf_stepback.py +0 -0
  30. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/calls/stepback.py +0 -0
  31. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/calls/top_n.py +0 -0
  32. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/classify.py +0 -0
  33. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/explore.py +0 -0
  34. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/extract.py +0 -0
  35. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/image_functions.py +0 -0
  36. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/images/circle.png +0 -0
  37. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/images/cube.png +0 -0
  38. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/images/diamond.png +0 -0
  39. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/images/overlapping_pentagons.png +0 -0
  40. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/images/rectangles.png +0 -0
  41. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/model_reference_list.py +0 -0
  42. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/pdf_functions.py +0 -0
  43. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/summarize.py +0 -0
  44. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/text_functions.py +0 -0
  45. {cat_stack-1.2.0 → cat_stack-1.4.0}/src/catstack/text_functions_ensemble.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cat-stack
3
- Version: 1.2.0
3
+ Version: 1.4.0
4
4
  Summary: Domain-agnostic text, image, PDF, and DOCX classification engine powered by LLMs
5
5
  Project-URL: Documentation, https://github.com/chrissoria/cat-stack#readme
6
6
  Project-URL: Issues, https://github.com/chrissoria/cat-stack/issues
@@ -1,7 +1,7 @@
1
1
  # SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
2
2
  #
3
3
  # SPDX-License-Identifier: GPL-3.0-or-later
4
- __version__ = "1.2.0"
4
+ __version__ = "1.4.0"
5
5
  __author__ = "Chris Soria"
6
6
  __email__ = "chrissoria@berkeley.edu"
7
7
  __title__ = "cat-stack"
@@ -92,6 +92,7 @@ from ._wrapper_helpers import (
92
92
  parse_models_string,
93
93
  short_label,
94
94
  classify_labels,
95
+ classify_indicators,
95
96
  )
96
97
 
97
98
  # Define public API
@@ -144,4 +145,5 @@ __all__ = [
144
145
  "parse_models_string",
145
146
  "short_label",
146
147
  "classify_labels",
148
+ "classify_indicators",
147
149
  ]
@@ -328,3 +328,101 @@ def classify_labels(
328
328
  if return_full:
329
329
  return labels_per_row, df
330
330
  return labels_per_row
331
+
332
+
333
+ def classify_indicators(
334
+ input_data,
335
+ categories,
336
+ *,
337
+ short_labels: bool = True,
338
+ return_full: bool = False,
339
+ **kwargs,
340
+ ):
341
+ """Convenience wrapper around `classify()` returning per-category indicators.
342
+
343
+ Like `classify_labels`, but instead of collapsing the wide DataFrame to
344
+ one assigned label per row, it returns a dict mapping each category to
345
+ a list of 0/1 indicators of length `len(input_data)`.
346
+
347
+ This is the right shape for language wrappers that want one indicator
348
+ variable per category (Stata's wide mode, future R `as_indicators=TRUE`
349
+ mode) instead of a single label per row.
350
+
351
+ Args:
352
+ input_data: Same as `classify()`.
353
+ categories: Same as `classify()` — list of category strings.
354
+ short_labels: If True (default), use `short_label()` on each
355
+ category to produce dict keys (`"Positive: defn"` → `"Positive"`).
356
+ If False, the dict keys are the full category strings.
357
+ return_full: If True, return `(indicators_dict, df)` so callers also
358
+ have access to the underlying DataFrame. Default False.
359
+ **kwargs: All other kwargs are forwarded to `classify()`.
360
+
361
+ Returns:
362
+ dict[str, list[int]]: keys are category labels (short or full),
363
+ values are 0/1 lists of length `len(input_data)`. In ensemble mode
364
+ the indicators come from the `category_N_consensus` columns; in
365
+ single-model mode from `category_N`.
366
+ Or `(dict, df)` tuple if `return_full=True`.
367
+
368
+ Raises:
369
+ RuntimeError: if `classify()` returns a DataFrame that contains
370
+ neither `category_N` nor `category_N_consensus` columns
371
+ (centralized schema canary, same trigger as `classify_labels`).
372
+
373
+ Example:
374
+ >>> indicators = classify_indicators(
375
+ ... ["I moved for the job and to be near family.",
376
+ ... "Lower cost of living was the only reason."],
377
+ ... ["Job: career", "Family: relationships", "Cost: affordability"],
378
+ ... api_key="...", user_model="gpt-4o-mini",
379
+ ... )
380
+ >>> indicators
381
+ {'Job': [1, 0], 'Family': [1, 0], 'Cost': [0, 1]}
382
+ """
383
+ # Reuse classify_labels for the df + centralized schema canary. We
384
+ # pass short_labels=False because we want the raw df; we apply our own
385
+ # short_label() to the dict keys below.
386
+ _labels, df = classify_labels(
387
+ input_data,
388
+ categories,
389
+ short_labels=False,
390
+ return_full=True,
391
+ **kwargs,
392
+ )
393
+
394
+ cols = list(df.columns)
395
+ indexed: List[Tuple[int, str]] = []
396
+ for c in cols:
397
+ m = _CONSENSUS_COL_PAT.match(c)
398
+ if m:
399
+ indexed.append((int(m.group(1)), c))
400
+ if not indexed:
401
+ for c in cols:
402
+ m = _SINGLE_COL_PAT.match(c)
403
+ if m:
404
+ indexed.append((int(m.group(1)), c))
405
+ # classify_labels already raised RuntimeError if neither family is
406
+ # present, so we know `indexed` is non-empty here.
407
+ indexed.sort(key=lambda t: t[0])
408
+
409
+ keys = [short_label(c) if short_labels else c for c in categories]
410
+
411
+ out: Dict[str, List[int]] = {}
412
+ for n, col in indexed:
413
+ cat_idx = n - 1
414
+ if not (0 <= cat_idx < len(keys)):
415
+ continue
416
+ key = str(keys[cat_idx])
417
+ series = df[col]
418
+ values: List[int] = []
419
+ for v in series:
420
+ try:
421
+ values.append(1 if int(v) == 1 else 0)
422
+ except (ValueError, TypeError):
423
+ values.append(0)
424
+ out[key] = values
425
+
426
+ if return_full:
427
+ return out, df
428
+ return out
@@ -749,17 +749,17 @@ def _generate_category_instruction(
749
749
  # Current instruction
750
750
  current_text = f'\nCURRENT INSTRUCTION FOR THIS CATEGORY:\n"{current_instruction}"\n' if current_instruction else ""
751
751
 
752
- # History of previous attempts for this category so the meta-LLM doesn't repeat itself
752
+ # History of previous attempts capped at last 3 to avoid prompt bloat.
753
+ # Format is deliberately simple (no score numbers) so smaller models can follow it.
753
754
  history_text = ""
754
755
  if attempt_history:
755
- history_lines = []
756
- for i, h in enumerate(attempt_history, 1):
757
- history_lines.append(
758
- f' Attempt {i}: "{h["instruction"]}"'
759
- f' → {h["outcome"]} (holdout score {h["score_before"]:.2f}→{h["score_after"]:.2f})'
760
- )
756
+ recent = attempt_history[-3:]
757
+ history_lines = [
758
+ f' - "{h["instruction"]}" [{h["outcome"]}]'
759
+ for h in recent
760
+ ]
761
761
  history_text = (
762
- "\nPREVIOUS ATTEMPTS FOR THIS CATEGORY — do not repeat these:\n"
762
+ "\nPREVIOUS INSTRUCTIONS TRIED FOR THIS CATEGORY (already tested write something different):\n"
763
763
  + "\n".join(history_lines)
764
764
  + "\n"
765
765
  )
File without changes
File without changes
File without changes
File without changes