cat-web 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cat_web-0.1.0.dist-info/METADATA +163 -0
- cat_web-0.1.0.dist-info/RECORD +10 -0
- cat_web-0.1.0.dist-info/WHEEL +4 -0
- catweb/__about__.py +10 -0
- catweb/__init__.py +113 -0
- catweb/_web_fetch.py +22 -0
- catweb/classify.py +184 -0
- catweb/explore.py +114 -0
- catweb/extract.py +112 -0
- catweb/summarize.py +118 -0
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cat-web
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: LLM-powered classification and extraction for web content
|
|
5
|
+
Project-URL: Documentation, https://github.com/chrissoria/cat-web#readme
|
|
6
|
+
Project-URL: Issues, https://github.com/chrissoria/cat-web/issues
|
|
7
|
+
Project-URL: Source, https://github.com/chrissoria/cat-web
|
|
8
|
+
Author-email: Chris Soria <chrissoria@berkeley.edu>
|
|
9
|
+
License-Expression: GPL-3.0-or-later
|
|
10
|
+
Keywords: llm,structured output,text classification,url classification,web content
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Programming Language :: Python
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
19
|
+
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
20
|
+
Requires-Python: >=3.8
|
|
21
|
+
Requires-Dist: cat-stack>=0.1.0
|
|
22
|
+
Requires-Dist: pandas
|
|
23
|
+
Requires-Dist: requests
|
|
24
|
+
Requires-Dist: tqdm
|
|
25
|
+
Provides-Extra: pdf
|
|
26
|
+
Requires-Dist: cat-stack[pdf]; extra == 'pdf'
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
# CatWeb
|
|
30
|
+
|
|
31
|
+
LLM-powered classification, extraction, and summarization for web content.
|
|
32
|
+
|
|
33
|
+
Part of the [CatLLM ecosystem](https://github.com/chrissoria/cat-llm). Thin wrapper around [cat-stack](https://github.com/chrissoria/cat-stack) that adds URL fetching and web-specific context injection.
|
|
34
|
+
|
|
35
|
+
## Installation
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install cat-web # pulls in cat-stack automatically
|
|
39
|
+
pip install "cat-web[pdf]"   # with PDF support (quotes needed in zsh)
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Quick Start
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
import catweb as cat
|
|
46
|
+
|
|
47
|
+
# Classify web pages by topic
|
|
48
|
+
results = cat.classify(
|
|
49
|
+
categories=["News", "Opinion", "Tutorial", "Reference"],
|
|
50
|
+
input_data=[
|
|
51
|
+
"https://example.com/article1",
|
|
52
|
+
"https://example.com/article2",
|
|
53
|
+
],
|
|
54
|
+
api_key="your-api-key",
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
# Extract categories from web content
|
|
58
|
+
extracted = cat.extract(
|
|
59
|
+
input_data=["https://example.com/page1", "https://example.com/page2"],
|
|
60
|
+
description="Blog posts about technology",
|
|
61
|
+
api_key="your-api-key",
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
# Summarize web pages
|
|
65
|
+
summaries = cat.summarize(
|
|
66
|
+
input_data=["https://example.com/article1"],
|
|
67
|
+
description="News articles",
|
|
68
|
+
api_key="your-api-key",
|
|
69
|
+
)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## How It Works
|
|
73
|
+
|
|
74
|
+
CatWeb accepts URLs as input, fetches the web content, strips HTML to plain text, and passes the text through cat-stack's classification/extraction/summarization pipeline. Original URLs are preserved in the output DataFrame's `input_data` column.
|
|
75
|
+
|
|
76
|
+
You can also pass pre-fetched text directly — CatWeb auto-detects whether input is URLs or plain text.
|
|
77
|
+
|
|
78
|
+
## API Reference
|
|
79
|
+
|
|
80
|
+
### `classify(categories, input_data, api_key, ...)`
|
|
81
|
+
|
|
82
|
+
Classify web content into predefined categories.
|
|
83
|
+
|
|
84
|
+
| Parameter | Type | Description |
|
|
85
|
+
|-----------|------|-------------|
|
|
86
|
+
| `categories` | list | Category names for classification |
|
|
87
|
+
| `input_data` | list/Series | URLs or text strings to classify |
|
|
88
|
+
| `api_key` | str | API key for the model provider |
|
|
89
|
+
| `source_domain` | str | Source domain (injected as prompt context) |
|
|
90
|
+
| `content_type` | str | Content type, e.g. "news article", "blog post" |
|
|
91
|
+
| `web_metadata` | dict | Additional key-value context for the prompt |
|
|
92
|
+
| `timeout` | int | URL fetch timeout in seconds (default 30) |
|
|
93
|
+
| `**kwargs` | | All cat-stack classify() parameters (models, creativity, batch_mode, etc.) |
|
|
94
|
+
|
|
95
|
+
### `extract(input_data, api_key, ...)`
|
|
96
|
+
|
|
97
|
+
Discover categories from web content.
|
|
98
|
+
|
|
99
|
+
| Parameter | Type | Description |
|
|
100
|
+
|-----------|------|-------------|
|
|
101
|
+
| `input_data` | list/Series | URLs or text strings |
|
|
102
|
+
| `api_key` | str | API key |
|
|
103
|
+
| `source_domain` | str | Source domain context |
|
|
104
|
+
| `content_type` | str | Content type context |
|
|
105
|
+
| `web_metadata` | dict | Additional context |
|
|
106
|
+
| `timeout` | int | URL fetch timeout (default 30) |
|
|
107
|
+
| `**kwargs` | | All cat-stack extract() parameters |
|
|
108
|
+
|
|
109
|
+
### `explore(input_data, api_key, ...)`
|
|
110
|
+
|
|
111
|
+
Raw category extraction (with duplicates) for saturation analysis.
|
|
112
|
+
|
|
113
|
+
Same parameters as `extract()`, plus all cat-stack `explore()` parameters.
|
|
114
|
+
|
|
115
|
+
### `summarize(input_data, ...)`
|
|
116
|
+
|
|
117
|
+
Summarize web content.
|
|
118
|
+
|
|
119
|
+
| Parameter | Type | Description |
|
|
120
|
+
|-----------|------|-------------|
|
|
121
|
+
| `input_data` | list/Series | URLs or text strings |
|
|
122
|
+
| `source_domain` | str | Source domain context |
|
|
123
|
+
| `content_type` | str | Content type context |
|
|
124
|
+
| `web_metadata` | dict | Additional context |
|
|
125
|
+
| `timeout` | int | URL fetch timeout (default 30) |
|
|
126
|
+
| `**kwargs` | | All cat-stack summarize() parameters (api_key, description, models, etc.) |
|
|
127
|
+
|
|
128
|
+
### Web Utilities
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
from catweb import is_url, fetch_url_text, fetch_urls
|
|
132
|
+
|
|
133
|
+
# Check if a string is a URL
|
|
134
|
+
is_url("https://example.com") # True
|
|
135
|
+
is_url("just text") # False
|
|
136
|
+
|
|
137
|
+
# Fetch a single URL
|
|
138
|
+
text, error = fetch_url_text("https://example.com")
|
|
139
|
+
|
|
140
|
+
# Fetch multiple URLs
|
|
141
|
+
results = fetch_urls(["https://a.com", "https://b.com"])
|
|
142
|
+
# Returns: [(url, text, error), ...]
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## Multi-Model Ensemble
|
|
146
|
+
|
|
147
|
+
All cat-stack ensemble features work through `**kwargs`:
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
results = cat.classify(
|
|
151
|
+
categories=["Positive", "Negative", "Neutral"],
|
|
152
|
+
input_data=urls,
|
|
153
|
+
models=[
|
|
154
|
+
("gpt-4o", "openai", "sk-..."),
|
|
155
|
+
("claude-sonnet-4-5-20250929", "anthropic", "sk-ant-..."),
|
|
156
|
+
],
|
|
157
|
+
consensus_threshold="majority",
|
|
158
|
+
)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## License
|
|
162
|
+
|
|
163
|
+
GPL-3.0-or-later
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
catweb/__about__.py,sha256=0QOx69kyMOxznoYuYoBJvqInc9Yo9mtwrFDC1PMYELg,368
|
|
2
|
+
catweb/__init__.py,sha256=_EzIU2RoyRRejLj9kuIzmuaJd6u1ImIFqPFbi5N6C5g,3075
|
|
3
|
+
catweb/_web_fetch.py,sha256=MSB9VUJm4A0sd089-lHQRIBUa7E3oY97trSfTZNIRQA,424
|
|
4
|
+
catweb/classify.py,sha256=2VOF1yGSsoA9M-B0-MNDeT1AxoaFh9sLDprj1cFAC-Y,6730
|
|
5
|
+
catweb/explore.py,sha256=7RBtUAWAmeA_a5tX8KaAU-A_TZ-8sG7bvnJmDd65_oQ,3880
|
|
6
|
+
catweb/extract.py,sha256=T-7jST1OTKYPnyu70-dJJOvWLPAnZbZXrq_x9qW3YNI,3812
|
|
7
|
+
catweb/summarize.py,sha256=jdlO5N5EdbV08ja0O2--IQIF154XafymOuNFtmdzdEI,3916
|
|
8
|
+
cat_web-0.1.0.dist-info/METADATA,sha256=Nk3lVgu7clOqe-wre8-4-0CwVPPXCJsbMN8L87t9mXg,5345
|
|
9
|
+
cat_web-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
10
|
+
cat_web-0.1.0.dist-info/RECORD,,
|
catweb/__about__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
#
# SPDX-License-Identifier: GPL-3.0-or-later

"""Static package metadata for cat-web (re-exported by catweb/__init__.py)."""

# Single source of truth for the package's metadata strings.
__version__ = "0.1.0"
__author__ = "Christopher Soria"
__description__ = "LLM-powered classification and extraction for web content"
__title__ = "cat-web"
__url__ = "https://github.com/chrissoria/cat-web"
__license__ = "GPL-3.0-or-later"
|
catweb/__init__.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: 2025-present Christopher Soria <chrissoria@berkeley.edu>
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
4
|
+
|
|
5
|
+
from .__about__ import (
|
|
6
|
+
__version__,
|
|
7
|
+
__author__,
|
|
8
|
+
__description__,
|
|
9
|
+
__title__,
|
|
10
|
+
__url__,
|
|
11
|
+
__license__,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
# =============================================================================
|
|
15
|
+
# Public API — catweb entry points (thin wrappers around cat_stack)
|
|
16
|
+
# =============================================================================
|
|
17
|
+
from .classify import classify
|
|
18
|
+
from .extract import extract
|
|
19
|
+
from .explore import explore
|
|
20
|
+
from .summarize import summarize
|
|
21
|
+
|
|
22
|
+
# =============================================================================
|
|
23
|
+
# Web fetching utilities (catweb-specific)
|
|
24
|
+
# =============================================================================
|
|
25
|
+
from ._web_fetch import (
|
|
26
|
+
is_url,
|
|
27
|
+
fetch_url_text,
|
|
28
|
+
fetch_urls,
|
|
29
|
+
detect_url_input,
|
|
30
|
+
strip_html_tags,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# =============================================================================
|
|
34
|
+
# Re-exports from cat_stack (backward compatibility + provider utilities)
|
|
35
|
+
# =============================================================================
|
|
36
|
+
from cat_stack import (
|
|
37
|
+
# Category analysis
|
|
38
|
+
has_other_category,
|
|
39
|
+
check_category_verbosity,
|
|
40
|
+
# Batch exceptions
|
|
41
|
+
BatchJobExpiredError,
|
|
42
|
+
BatchJobFailedError,
|
|
43
|
+
# Provider utilities
|
|
44
|
+
UnifiedLLMClient,
|
|
45
|
+
detect_provider,
|
|
46
|
+
set_ollama_endpoint,
|
|
47
|
+
check_ollama_running,
|
|
48
|
+
list_ollama_models,
|
|
49
|
+
check_ollama_model,
|
|
50
|
+
pull_ollama_model,
|
|
51
|
+
PROVIDER_CONFIG,
|
|
52
|
+
# Deprecated backward-compat functions
|
|
53
|
+
explore_common_categories,
|
|
54
|
+
explore_corpus,
|
|
55
|
+
explore_image_categories,
|
|
56
|
+
explore_pdf_categories,
|
|
57
|
+
classify_ensemble,
|
|
58
|
+
multi_class,
|
|
59
|
+
image_multi_class,
|
|
60
|
+
pdf_multi_class,
|
|
61
|
+
summarize_ensemble,
|
|
62
|
+
# Utilities
|
|
63
|
+
build_json_schema,
|
|
64
|
+
extract_json,
|
|
65
|
+
validate_classification_json,
|
|
66
|
+
image_score_drawing,
|
|
67
|
+
image_features,
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
# Define public API
|
|
71
|
+
__all__ = [
|
|
72
|
+
# Main entry points (catweb wrappers)
|
|
73
|
+
"classify",
|
|
74
|
+
"extract",
|
|
75
|
+
"explore",
|
|
76
|
+
"summarize",
|
|
77
|
+
# Web fetching utilities
|
|
78
|
+
"is_url",
|
|
79
|
+
"fetch_url_text",
|
|
80
|
+
"fetch_urls",
|
|
81
|
+
"detect_url_input",
|
|
82
|
+
"strip_html_tags",
|
|
83
|
+
# Category analysis (from cat_stack)
|
|
84
|
+
"has_other_category",
|
|
85
|
+
"check_category_verbosity",
|
|
86
|
+
# Batch exceptions (from cat_stack)
|
|
87
|
+
"BatchJobExpiredError",
|
|
88
|
+
"BatchJobFailedError",
|
|
89
|
+
# Provider utilities (from cat_stack)
|
|
90
|
+
"UnifiedLLMClient",
|
|
91
|
+
"detect_provider",
|
|
92
|
+
"set_ollama_endpoint",
|
|
93
|
+
"check_ollama_running",
|
|
94
|
+
"list_ollama_models",
|
|
95
|
+
"check_ollama_model",
|
|
96
|
+
"pull_ollama_model",
|
|
97
|
+
"PROVIDER_CONFIG",
|
|
98
|
+
# Deprecated backward-compat (from cat_stack)
|
|
99
|
+
"explore_common_categories",
|
|
100
|
+
"explore_corpus",
|
|
101
|
+
"explore_image_categories",
|
|
102
|
+
"explore_pdf_categories",
|
|
103
|
+
"classify_ensemble",
|
|
104
|
+
"summarize_ensemble",
|
|
105
|
+
"multi_class",
|
|
106
|
+
"image_multi_class",
|
|
107
|
+
"pdf_multi_class",
|
|
108
|
+
"image_score_drawing",
|
|
109
|
+
"image_features",
|
|
110
|
+
"build_json_schema",
|
|
111
|
+
"extract_json",
|
|
112
|
+
"validate_classification_json",
|
|
113
|
+
]
|
catweb/_web_fetch.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Re-export web fetching utilities from cat_stack.
|
|
3
|
+
|
|
4
|
+
These functions now live in cat_stack._web_fetch. This module re-exports
|
|
5
|
+
them for backward compatibility within cat-web.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from cat_stack._web_fetch import (
|
|
9
|
+
is_url,
|
|
10
|
+
fetch_url_text,
|
|
11
|
+
fetch_urls,
|
|
12
|
+
detect_url_input,
|
|
13
|
+
strip_html_tags,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"is_url",
|
|
18
|
+
"fetch_url_text",
|
|
19
|
+
"fetch_urls",
|
|
20
|
+
"detect_url_input",
|
|
21
|
+
"strip_html_tags",
|
|
22
|
+
]
|
catweb/classify.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Classification functions for CatWeb.
|
|
3
|
+
|
|
4
|
+
Thin wrapper around cat_stack.classify() that adds web-specific features:
|
|
5
|
+
- URL fetching (accepts list of URLs as input_data)
|
|
6
|
+
- Web context injection (source_domain, content_type)
|
|
7
|
+
- Post-classification URL preservation (original URLs in input_data column)
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import cat_stack
|
|
11
|
+
|
|
12
|
+
from ._web_fetch import fetch_urls, is_url
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"classify",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _normalize_url_list(input_data):
|
|
20
|
+
"""Normalize input_data to a list of URL strings."""
|
|
21
|
+
if isinstance(input_data, str):
|
|
22
|
+
return [input_data]
|
|
23
|
+
elif hasattr(input_data, 'tolist'):
|
|
24
|
+
return input_data.tolist()
|
|
25
|
+
else:
|
|
26
|
+
return list(input_data)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _build_web_context(source_domain, content_type, web_metadata):
|
|
30
|
+
"""Build a context block string from web content metadata fields."""
|
|
31
|
+
parts = []
|
|
32
|
+
if source_domain:
|
|
33
|
+
parts.append(f"Source domain: {source_domain}")
|
|
34
|
+
if content_type:
|
|
35
|
+
parts.append(f"Content type: {content_type}")
|
|
36
|
+
if web_metadata:
|
|
37
|
+
for k, v in web_metadata.items():
|
|
38
|
+
parts.append(f"{k.capitalize()}: {v}")
|
|
39
|
+
return "\n".join(parts)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def classify(
    categories,
    input_data=None,
    api_key=None,
    # Web context fields — injected into the classification prompt
    source_domain: str = None,
    content_type: str = None,
    web_metadata: dict = None,
    description="",
    filename=None,
    save_directory=None,
    timeout: int = 30,
    **kwargs,
):
    """
    Classify web content (URLs) with web-specific features.

    Wraps cat_stack.classify() and adds:
    - Automatic URL fetching (pass a list of URLs as input_data)
    - Web context injection into the classification prompt
    - Original URLs preserved in the input_data column

    Args:
        categories (list): List of category names for classification.
        input_data: The data to classify. Can be:
            - List of URLs (auto-fetched and classified as text)
            - List of text strings (classified directly, no fetching)
            - pandas Series of URLs or text
        api_key (str): API key for the model provider (single-model mode).
        source_domain (str): Source domain — injected into the prompt as context.
        content_type (str): Content type (e.g., "news article", "blog post").
        web_metadata (dict): Additional context injected into the prompt.
        description (str): Description of the input data context.
        filename (str): Output filename for CSV.
        save_directory (str): Directory to save results.
        timeout (int): Timeout in seconds for URL fetching. Default 30.
        **kwargs: All other parameters passed through to cat_stack.classify()
            (e.g. user_model, models, creativity, batch_mode, consensus_threshold,
            chain_of_thought, thinking_budget, embeddings, etc.)

    Returns:
        pd.DataFrame: Results with classification columns. When URLs are
            provided, the input_data column contains the original URLs
            (not the fetched content).

    Raises:
        ValueError: If neither api_key nor a models= kwarg is supplied, or
            if input_data is missing.

    Examples:
        >>> import catweb as cat
        >>>
        >>> # Classify web pages by topic
        >>> results = cat.classify(
        ...     categories=["News", "Opinion", "Tutorial", "Reference"],
        ...     input_data=["https://example.com/article1", "https://example.com/article2"],
        ...     api_key="your-api-key",
        ... )
        >>>
        >>> # Classify with web context
        >>> results = cat.classify(
        ...     input_data=urls,
        ...     categories=["Positive", "Negative", "Neutral"],
        ...     source_domain="news.ycombinator.com",
        ...     content_type="forum discussion",
        ...     api_key="your-api-key",
        ... )
        >>>
        >>> # Classify pre-fetched text (no URL fetching)
        >>> results = cat.classify(
        ...     input_data=["Some text content...", "More text..."],
        ...     categories=["Tech", "Science", "Politics"],
        ...     api_key="your-api-key",
        ... )
    """
    # Early validation
    if api_key is None and kwargs.get("models") is None:
        raise ValueError(
            "[CatWeb] api_key is required. Pass api_key='sk-...' or use the "
            "models= parameter for multi-model mode."
        )

    if input_data is None:
        raise ValueError("[CatWeb] input_data is required.")

    # Check if input is URLs — fetch content if so.
    # _url_originals stays None for plain-text input; non-None signals that
    # the later URL-restoration / deferred-save branch must run.
    _url_originals = None
    # NOTE(review): imported locally — the module-level import only brings in
    # fetch_urls and is_url, not detect_url_input.
    from ._web_fetch import detect_url_input
    if detect_url_input(input_data):
        url_list = _normalize_url_list(input_data)
        print(f"[CatWeb] Fetching content from {len(url_list)} URLs...")
        url_results = fetch_urls(url_list, timeout=timeout)

        _url_originals = []
        fetched_texts = []
        success_count = 0
        for url, text, error in url_results:
            _url_originals.append(url)
            if error:
                # A failed fetch is downgraded to an empty string so the row
                # count (and positional alignment with URLs) is preserved.
                print(f"  Warning: {error}")
                fetched_texts.append("")
            else:
                fetched_texts.append(text)
                success_count += 1

        print(f"[CatWeb] Successfully fetched {success_count}/{len(url_list)} URLs")
        input_data = fetched_texts

    # Prepend web context to description if any fields provided
    web_context = _build_web_context(source_domain, content_type, web_metadata)
    if web_context:
        description = f"{web_context}\n{description}".strip() if description else web_context

    # Remove keys we set explicitly to avoid "multiple values" if caller also passes them.
    # NOTE(review): this silently discards caller-supplied add_other/check_verbosity;
    # both are forced to False below.
    kwargs.pop("add_other", None)
    kwargs.pop("check_verbosity", None)

    result = cat_stack.classify(
        input_data=input_data,
        categories=categories,
        api_key=api_key,
        description=description,
        add_other=False,
        check_verbosity=False,
        # When URLs were fetched, suppress cat_stack's own CSV save so we can
        # save below AFTER restoring the original URLs into the output.
        filename=None if _url_originals else filename,
        save_directory=None if _url_originals else save_directory,
        **kwargs,
    )

    # Replace fetched text with original URLs in input_data column
    if _url_originals is not None:
        result = result.reset_index(drop=True)
        if "input_data" in result.columns:
            result["input_data"] = _url_originals
        # Save with URLs if filename was requested (deferred from cat_stack above)
        if filename or save_directory:
            import os as _os
            out = filename
            if save_directory and filename:
                out = _os.path.join(save_directory, _os.path.basename(filename))
            elif save_directory:
                # Directory given without a filename: fall back to a default name.
                out = _os.path.join(save_directory, "results.csv")
            if out:
                result.to_csv(out, index=False)
                print(f"Results saved to {out}")

    return result
|
catweb/explore.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Category exploration functions for CatWeb.
|
|
3
|
+
|
|
4
|
+
Thin wrapper around cat_stack.explore() that adds web-specific features:
|
|
5
|
+
- URL fetching (accepts list of URLs as input_data)
|
|
6
|
+
- Web context injection
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import cat_stack
|
|
10
|
+
|
|
11
|
+
from ._web_fetch import fetch_urls, detect_url_input
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"explore",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def explore(
    input_data=None,
    api_key=None,
    # Web context fields
    source_domain: str = None,
    content_type: str = None,
    web_metadata: dict = None,
    description="",
    timeout: int = 30,
    **kwargs,
):
    """
    Run raw category exploration over web content.

    Thin wrapper over ``cat_stack.explore()`` that first resolves URL
    inputs to page text (via ``fetch_urls``) and folds any web metadata
    fields into the *description* passed downstream. Unlike ``extract()``,
    no normalization or merging happens here: every category string from
    every chunk across every iteration is returned, duplicates intact —
    which is what saturation analysis needs.

    Args:
        input_data: URLs, plain text strings, or a pandas Series of either.
            URL inputs are auto-detected and fetched.
        api_key (str): API key for the model provider. Required.
        source_domain (str): Optional source domain, injected as prompt context.
        content_type (str): Optional content type (e.g., "news article").
        web_metadata (dict): Optional extra key/value prompt context.
        description (str): Description of the input data context.
        timeout (int): Per-URL fetch timeout in seconds. Default 30.
        **kwargs: Forwarded untouched to cat_stack.explore() (e.g.
            max_categories, categories_per_chunk, divisions, user_model,
            creativity, specificity, research_question, filename,
            model_source, iterations, random_state, focus, etc.)

    Returns:
        list[str]: Every extracted category string, duplicates included.

    Raises:
        ValueError: If ``api_key`` or ``input_data`` is missing.

    Examples:
        >>> import catweb as cat
        >>> raw_categories = cat.explore(
        ...     input_data=["https://example.com/page1", "https://example.com/page2"],
        ...     description="News articles",
        ...     api_key="your-api-key",
        ...     iterations=3,
        ... )
        >>> print(raw_categories[:5])
    """
    if api_key is None:
        raise ValueError(
            "[CatWeb] api_key is required. Pass api_key='sk-...'."
        )

    if input_data is None:
        raise ValueError("[CatWeb] input_data is required.")

    # URL inputs are fetched up front and replaced by their page text.
    if detect_url_input(input_data):
        if isinstance(input_data, str):
            url_list = [input_data]
        else:
            as_list = getattr(input_data, "tolist", None)
            url_list = as_list() if as_list is not None else list(input_data)

        print(f"[CatWeb] Fetching content from {len(url_list)} URLs for exploration...")
        url_results = fetch_urls(url_list, timeout=timeout)

        # Failed fetches become empty strings so positions stay aligned with the URLs.
        input_data = [page_text or "" for _, page_text, _ in url_results]
        success_count = sum(1 for _, _, fetch_err in url_results if not fetch_err)
        print(f"[CatWeb] Successfully fetched {success_count}/{len(url_list)} URLs")

    # Fold the web metadata fields into the description as prompt context.
    context_lines = []
    if source_domain:
        context_lines.append(f"Source domain: {source_domain}")
    if content_type:
        context_lines.append(f"Content type: {content_type}")
    if web_metadata:
        context_lines.extend(
            f"{key.capitalize()}: {value}" for key, value in web_metadata.items()
        )
    web_context = "\n".join(context_lines)

    if web_context:
        description = f"{web_context}\n{description}".strip() if description else web_context

    return cat_stack.explore(
        input_data=input_data,
        api_key=api_key,
        description=description,
        **kwargs,
    )
|
catweb/extract.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Category extraction functions for CatWeb.
|
|
3
|
+
|
|
4
|
+
Thin wrapper around cat_stack.extract() that adds web-specific features:
|
|
5
|
+
- URL fetching (accepts list of URLs as input_data)
|
|
6
|
+
- Web context injection
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import cat_stack
|
|
10
|
+
|
|
11
|
+
from ._web_fetch import fetch_urls, detect_url_input
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"extract",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def extract(
    input_data=None,
    api_key=None,
    # Web context fields
    source_domain: str = None,
    content_type: str = None,
    web_metadata: dict = None,
    description="",
    timeout: int = 30,
    **kwargs,
):
    """
    Extract (discover) categories from web content.

    Thin wrapper over ``cat_stack.extract()`` that first resolves URL
    inputs to page text (via ``fetch_urls``) and folds any web metadata
    fields into the description forwarded downstream.

    Args:
        input_data: The data to extract from. URLs, plain text strings, or
            a pandas Series of either. URL inputs are auto-detected and fetched.
        api_key (str): API key for the model provider. Required.
        source_domain (str): Optional source domain, injected as prompt context.
        content_type (str): Optional content type (e.g., "news article").
        web_metadata (dict): Optional extra key/value prompt context.
        description (str): Description of the input data context.
        timeout (int): Per-URL fetch timeout in seconds. Default 30.
        **kwargs: Forwarded untouched to cat_stack.extract() (e.g.
            max_categories, categories_per_chunk, divisions, user_model,
            creativity, specificity, research_question, filename,
            model_source, iterations, random_state, focus, etc.)

    Returns:
        dict with keys:
            - counts_df: DataFrame of categories with counts
            - top_categories: List of top category names
            - raw_top_text: Raw model output from final merge step

    Raises:
        ValueError: If ``api_key`` or ``input_data`` is missing.

    Examples:
        >>> import catweb as cat
        >>> results = cat.extract(
        ...     input_data=["https://example.com/page1", "https://example.com/page2"],
        ...     description="News articles",
        ...     api_key="your-api-key",
        ... )
        >>> print(results['top_categories'])
    """
    if api_key is None:
        raise ValueError(
            "[CatWeb] api_key is required. Pass api_key='sk-...'."
        )

    if input_data is None:
        raise ValueError("[CatWeb] input_data is required.")

    # URL inputs are fetched up front and replaced by their page text.
    if detect_url_input(input_data):
        if isinstance(input_data, str):
            url_list = [input_data]
        else:
            as_list = getattr(input_data, "tolist", None)
            url_list = as_list() if as_list is not None else list(input_data)

        print(f"[CatWeb] Fetching content from {len(url_list)} URLs for extraction...")
        url_results = fetch_urls(url_list, timeout=timeout)

        # Failed fetches become empty strings so positions stay aligned with the URLs.
        input_data = [page_text or "" for _, page_text, _ in url_results]
        success_count = sum(1 for _, _, fetch_err in url_results if not fetch_err)
        print(f"[CatWeb] Successfully fetched {success_count}/{len(url_list)} URLs")

    # Fold the web metadata fields into the description as prompt context.
    context_lines = []
    if source_domain:
        context_lines.append(f"Source domain: {source_domain}")
    if content_type:
        context_lines.append(f"Content type: {content_type}")
    if web_metadata:
        context_lines.extend(
            f"{key.capitalize()}: {value}" for key, value in web_metadata.items()
        )
    web_context = "\n".join(context_lines)

    if web_context:
        description = f"{web_context}\n{description}".strip() if description else web_context

    # cat_stack.extract() takes the context under its `survey_question` parameter.
    return cat_stack.extract(
        input_data=input_data,
        api_key=api_key,
        survey_question=description,
        **kwargs,
    )
|
catweb/summarize.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Summarization functions for CatWeb.
|
|
3
|
+
|
|
4
|
+
Thin wrapper around cat_stack.summarize() that adds web-specific features:
|
|
5
|
+
- URL fetching (accepts list of URLs as input_data)
|
|
6
|
+
- Web context injection
|
|
7
|
+
- Original URLs preserved in output
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import cat_stack
|
|
11
|
+
|
|
12
|
+
from ._web_fetch import fetch_urls, detect_url_input
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"summarize",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def summarize(
    input_data=None,
    # Web context fields
    source_domain: str = None,
    content_type: str = None,
    web_metadata: dict = None,
    timeout: int = 30,
    **kwargs,
):
    """
    Summarize web content (URLs or text) using LLMs.

    Wraps cat_stack.summarize() and adds:
    - Automatic URL fetching (pass a list of URLs as input_data)
    - Web context injection into the summarization prompt
    - Original URLs preserved in the input_data column

    Note: unlike classify()/extract(), no api_key validation happens here —
    api_key travels inside **kwargs straight to cat_stack.summarize().

    Args:
        input_data: Data to summarize. Can be:
            - List of URLs (auto-fetched and summarized as text)
            - List of text strings, pandas Series, or single string
        source_domain (str): Source domain — injected into the prompt as context.
        content_type (str): Content type (e.g., "news article", "blog post").
        web_metadata (dict): Additional context injected into the prompt.
        timeout (int): Timeout in seconds for URL fetching. Default 30.
        **kwargs: All parameters passed through to cat_stack.summarize()
            (e.g. api_key, description, instructions, max_length, focus,
            user_model, models, mode, creativity, batch_mode, etc.)

    Returns:
        pd.DataFrame: Results with summary column(s). When URLs are
            provided, the input_data column contains the original URLs.

    Raises:
        ValueError: If input_data is missing.

    Examples:
        >>> import catweb as cat
        >>>
        >>> results = cat.summarize(
        ...     input_data=["https://example.com/article1", "https://example.com/article2"],
        ...     description="News articles",
        ...     api_key="your-api-key",
        ... )
    """
    if input_data is None:
        raise ValueError("[CatWeb] input_data is required.")

    # Check if input is URLs — fetch content if so.
    # _url_originals stays None for plain-text input; non-None signals that
    # the URL-restoration step at the end must run.
    _url_originals = None
    if detect_url_input(input_data):
        if isinstance(input_data, str):
            url_list = [input_data]
        elif hasattr(input_data, 'tolist'):
            url_list = input_data.tolist()
        else:
            url_list = list(input_data)

        print(f"[CatWeb] Fetching content from {len(url_list)} URLs for summarization...")
        url_results = fetch_urls(url_list, timeout=timeout)

        _url_originals = []
        fetched_texts = []
        success_count = 0
        for url, text, error in url_results:
            _url_originals.append(url)
            if error:
                # A failed fetch is downgraded to an empty string so the row
                # count (and positional alignment with URLs) is preserved.
                print(f"  Warning: {error}")
                fetched_texts.append("")
            else:
                fetched_texts.append(text)
                success_count += 1

        print(f"[CatWeb] Successfully fetched {success_count}/{len(url_list)} URLs")
        input_data = fetched_texts

    # Build web context and inject into description
    parts = []
    if source_domain:
        parts.append(f"Source domain: {source_domain}")
    if content_type:
        parts.append(f"Content type: {content_type}")
    if web_metadata:
        for k, v in web_metadata.items():
            parts.append(f"{k.capitalize()}: {v}")
    web_context = "\n".join(parts)

    if web_context:
        # description lives in kwargs here (not a named parameter), so the
        # context is merged into kwargs before the pass-through call.
        desc = kwargs.get("description", "")
        kwargs["description"] = f"{web_context}\n{desc}".strip() if desc else web_context

    result = cat_stack.summarize(
        input_data=input_data,
        **kwargs,
    )

    # Replace fetched text with original URLs in input_data column
    if _url_originals is not None:
        result = result.reset_index(drop=True)
        if "input_data" in result.columns:
            result["input_data"] = _url_originals

    return result
|