parallel-web-tools 0.0.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parallel_web_tools-0.0.5/.gitignore +21 -0
- parallel_web_tools-0.0.5/PKG-INFO +340 -0
- parallel_web_tools-0.0.5/README.md +277 -0
- parallel_web_tools-0.0.5/parallel_web_tools/__init__.py +56 -0
- parallel_web_tools-0.0.5/parallel_web_tools/cli/__init__.py +5 -0
- parallel_web_tools-0.0.5/parallel_web_tools/cli/commands.py +1103 -0
- parallel_web_tools-0.0.5/parallel_web_tools/cli/planner.py +438 -0
- parallel_web_tools-0.0.5/parallel_web_tools/core/__init__.py +85 -0
- parallel_web_tools-0.0.5/parallel_web_tools/core/auth.py +257 -0
- parallel_web_tools-0.0.5/parallel_web_tools/core/batch.py +280 -0
- parallel_web_tools-0.0.5/parallel_web_tools/core/research.py +288 -0
- parallel_web_tools-0.0.5/parallel_web_tools/core/result.py +29 -0
- parallel_web_tools-0.0.5/parallel_web_tools/core/runner.py +75 -0
- parallel_web_tools-0.0.5/parallel_web_tools/core/schema.py +167 -0
- parallel_web_tools-0.0.5/parallel_web_tools/integrations/__init__.py +53 -0
- parallel_web_tools-0.0.5/parallel_web_tools/integrations/bigquery/__init__.py +34 -0
- parallel_web_tools-0.0.5/parallel_web_tools/integrations/bigquery/cloud_function/main.py +199 -0
- parallel_web_tools-0.0.5/parallel_web_tools/integrations/bigquery/cloud_function/requirements.txt +5 -0
- parallel_web_tools-0.0.5/parallel_web_tools/integrations/bigquery/deploy.py +510 -0
- parallel_web_tools-0.0.5/parallel_web_tools/integrations/bigquery/sql/create_functions.sql +23 -0
- parallel_web_tools-0.0.5/parallel_web_tools/integrations/duckdb/__init__.py +63 -0
- parallel_web_tools-0.0.5/parallel_web_tools/integrations/duckdb/batch.py +220 -0
- parallel_web_tools-0.0.5/parallel_web_tools/integrations/duckdb/udf.py +159 -0
- parallel_web_tools-0.0.5/parallel_web_tools/integrations/polars/__init__.py +37 -0
- parallel_web_tools-0.0.5/parallel_web_tools/integrations/polars/enrich.py +218 -0
- parallel_web_tools-0.0.5/parallel_web_tools/integrations/snowflake/__init__.py +46 -0
- parallel_web_tools-0.0.5/parallel_web_tools/integrations/snowflake/deploy.py +361 -0
- parallel_web_tools-0.0.5/parallel_web_tools/integrations/snowflake/sql/01_setup.sql +107 -0
- parallel_web_tools-0.0.5/parallel_web_tools/integrations/snowflake/sql/02_create_udf.sql +116 -0
- parallel_web_tools-0.0.5/parallel_web_tools/integrations/snowflake/sql/03_cleanup.sql +61 -0
- parallel_web_tools-0.0.5/parallel_web_tools/integrations/spark/__init__.py +57 -0
- parallel_web_tools-0.0.5/parallel_web_tools/integrations/spark/streaming.py +407 -0
- parallel_web_tools-0.0.5/parallel_web_tools/integrations/spark/udf.py +206 -0
- parallel_web_tools-0.0.5/parallel_web_tools/integrations/utils.py +32 -0
- parallel_web_tools-0.0.5/parallel_web_tools/processors/__init__.py +20 -0
- parallel_web_tools-0.0.5/parallel_web_tools/processors/bigquery.py +70 -0
- parallel_web_tools-0.0.5/parallel_web_tools/processors/csv.py +32 -0
- parallel_web_tools-0.0.5/parallel_web_tools/processors/duckdb.py +25 -0
- parallel_web_tools-0.0.5/pyproject.toml +140 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# Python-generated files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[oc]
|
|
4
|
+
build/
|
|
5
|
+
dist/
|
|
6
|
+
wheels/
|
|
7
|
+
*.egg-info
|
|
8
|
+
|
|
9
|
+
# Virtual environments
|
|
10
|
+
.venv
|
|
11
|
+
|
|
12
|
+
# Environment files
|
|
13
|
+
.env.local
|
|
14
|
+
.env
|
|
15
|
+
|
|
16
|
+
# Data files
|
|
17
|
+
data/
|
|
18
|
+
configs/
|
|
19
|
+
*.db
|
|
20
|
+
output_*.csv
|
|
21
|
+
notebooks/debug_enrichment.ipynb
|
|
@@ -0,0 +1,340 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: parallel-web-tools
|
|
3
|
+
Version: 0.0.5
|
|
4
|
+
Summary: Parallel Tools: CLI and data enrichment utilities for the Parallel API
|
|
5
|
+
Project-URL: Homepage, https://github.com/parallel-web/parallel-web-tools
|
|
6
|
+
Project-URL: Documentation, https://docs.parallel.ai
|
|
7
|
+
Project-URL: Repository, https://github.com/parallel-web/parallel-web-tools
|
|
8
|
+
Project-URL: Issues, https://github.com/parallel-web/parallel-web-tools/issues
|
|
9
|
+
Author-email: Parallel <support@parallel.ai>
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
Keywords: ai,data-enrichment,data-pipeline,etl,llm,parallel,web-search
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Database
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Requires-Python: >=3.12
|
|
22
|
+
Requires-Dist: click>=8.1.0
|
|
23
|
+
Requires-Dist: httpx>=0.25.0
|
|
24
|
+
Requires-Dist: pandas>=2.3.0
|
|
25
|
+
Requires-Dist: parallel-web>=0.4.0
|
|
26
|
+
Requires-Dist: polars>=1.37.0
|
|
27
|
+
Requires-Dist: pyarrow>=18.0.0
|
|
28
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
29
|
+
Requires-Dist: pyyaml>=6.0.0
|
|
30
|
+
Requires-Dist: questionary>=2.0.0
|
|
31
|
+
Requires-Dist: rich>=13.0.0
|
|
32
|
+
Provides-Extra: all
|
|
33
|
+
Requires-Dist: duckdb>=1.0.0; extra == 'all'
|
|
34
|
+
Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'all'
|
|
35
|
+
Requires-Dist: sqlalchemy-bigquery>=1.11.0; extra == 'all'
|
|
36
|
+
Requires-Dist: sqlalchemy>=2.0.0; extra == 'all'
|
|
37
|
+
Provides-Extra: bigquery
|
|
38
|
+
Requires-Dist: sqlalchemy-bigquery>=1.11.0; extra == 'bigquery'
|
|
39
|
+
Requires-Dist: sqlalchemy>=2.0.0; extra == 'bigquery'
|
|
40
|
+
Provides-Extra: bigquery-native
|
|
41
|
+
Provides-Extra: dev
|
|
42
|
+
Requires-Dist: duckdb>=1.0.0; extra == 'dev'
|
|
43
|
+
Requires-Dist: httpx>=0.25.0; extra == 'dev'
|
|
44
|
+
Requires-Dist: pre-commit>=4.0.0; extra == 'dev'
|
|
45
|
+
Requires-Dist: pyinstaller>=6.0.0; extra == 'dev'
|
|
46
|
+
Requires-Dist: pyrefly>=0.49.0; extra == 'dev'
|
|
47
|
+
Requires-Dist: pyspark>=3.4.0; extra == 'dev'
|
|
48
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
49
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
50
|
+
Requires-Dist: ruff>=0.14.0; extra == 'dev'
|
|
51
|
+
Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'dev'
|
|
52
|
+
Requires-Dist: sqlalchemy-bigquery>=1.11.0; extra == 'dev'
|
|
53
|
+
Requires-Dist: sqlalchemy>=2.0.0; extra == 'dev'
|
|
54
|
+
Provides-Extra: duckdb
|
|
55
|
+
Requires-Dist: duckdb>=1.0.0; extra == 'duckdb'
|
|
56
|
+
Provides-Extra: polars
|
|
57
|
+
Provides-Extra: snowflake
|
|
58
|
+
Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'snowflake'
|
|
59
|
+
Provides-Extra: spark
|
|
60
|
+
Requires-Dist: httpx>=0.25.0; extra == 'spark'
|
|
61
|
+
Requires-Dist: pyspark>=3.4.0; extra == 'spark'
|
|
62
|
+
Description-Content-Type: text/markdown
|
|
63
|
+
|
|
64
|
+
# Parallel-Web-Tools
|
|
65
|
+
|
|
66
|
+
CLI and data enrichment utilities for the [Parallel API](https://docs.parallel.ai).
|
|
67
|
+
|
|
68
|
+
> **Note:** The `parallel-web-tools` package provides the `parallel-cli` command-line tool and data enrichment utilities.
|
|
69
|
+
> It depends on [`parallel-web`](https://github.com/parallel-web/parallel-sdk-python), the official
|
|
70
|
+
> Parallel Python SDK, but does not contain it. Install `parallel-web` separately if you need
|
|
71
|
+
> direct SDK access.
|
|
72
|
+
|
|
73
|
+
## Features
|
|
74
|
+
|
|
75
|
+
- **CLI for Humans & AI Agents** - Works interactively or fully via command-line arguments
|
|
76
|
+
- **Web Search** - AI-powered search with domain filtering and date ranges
|
|
77
|
+
- **Content Extraction** - Extract clean markdown from any URL
|
|
78
|
+
- **Data Enrichment** - Enrich CSV, DuckDB, and BigQuery data with AI
|
|
79
|
+
- **AI-Assisted Planning** - Use natural language to define what data you want
|
|
80
|
+
- **Multiple Integrations** - Polars, DuckDB, Snowflake, BigQuery, Spark
|
|
81
|
+
|
|
82
|
+
## Installation
|
|
83
|
+
|
|
84
|
+
### Standalone CLI (Recommended)
|
|
85
|
+
|
|
86
|
+
Install the standalone `parallel-cli` binary for search, extract, enrichment, and deep research (no Python required):
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
curl -fsSL https://raw.githubusercontent.com/parallel-web/parallel-web-tools/main/install-cli.sh | bash
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
This automatically detects your platform (macOS/Linux, x64/arm64) and installs to `~/.local/bin`.
|
|
93
|
+
|
|
94
|
+
> **Note:** The standalone binary includes core CLI features. For deployment commands (`enrich deploy`), use pip: `pip install parallel-web-tools[snowflake]` or `[bigquery]`.
|
|
95
|
+
|
|
96
|
+
### Python Package
|
|
97
|
+
|
|
98
|
+
For programmatic usage or data enrichment integrations:
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
# Full install with CLI and the DuckDB, Snowflake, and BigQuery connectors (Spark is a separate extra)
|
|
102
|
+
pip install parallel-web-tools[all]
|
|
103
|
+
|
|
104
|
+
# Library only (minimal dependencies)
|
|
105
|
+
pip install parallel-web-tools
|
|
106
|
+
|
|
107
|
+
# With specific connectors
|
|
108
|
+
pip install parallel-web-tools # CLI (its dependencies ship with the base install; no separate "cli" extra is defined)
|
|
109
|
+
pip install parallel-web-tools[polars] # Polars DataFrame
|
|
110
|
+
pip install parallel-web-tools[duckdb] # DuckDB
|
|
111
|
+
pip install parallel-web-tools[bigquery] # BigQuery
|
|
112
|
+
pip install parallel-web-tools[spark] # Apache Spark
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## CLI Overview
|
|
116
|
+
|
|
117
|
+
```
|
|
118
|
+
parallel-cli
|
|
119
|
+
├── auth # Check authentication status
|
|
120
|
+
├── login # OAuth login (or use PARALLEL_API_KEY env var)
|
|
121
|
+
├── logout # Remove stored credentials
|
|
122
|
+
├── search # Web search
|
|
123
|
+
├── extract # Extract content from URLs
|
|
124
|
+
└── enrich # Data enrichment commands
|
|
125
|
+
├── run # Run enrichment
|
|
126
|
+
├── plan # Create YAML config
|
|
127
|
+
├── suggest # AI suggests output columns
|
|
128
|
+
└── deploy # Deploy to cloud systems (requires pip install)
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## Quick Start
|
|
132
|
+
|
|
133
|
+
### 1. Authenticate
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
# Interactive OAuth login
|
|
137
|
+
parallel-cli login
|
|
138
|
+
|
|
139
|
+
# Or set environment variable
|
|
140
|
+
export PARALLEL_API_KEY=your_api_key
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### 2. Search the Web
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
# Natural language search
|
|
147
|
+
parallel-cli search "What is Anthropic's latest AI model?" --json
|
|
148
|
+
|
|
149
|
+
# Keyword search with filters
|
|
150
|
+
parallel-cli search -q "bitcoin price" --after-date 2024-01-01 --json
|
|
151
|
+
|
|
152
|
+
# Search specific domains
|
|
153
|
+
parallel-cli search "SEC filings for Apple" --include-domains sec.gov --json
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
### 3. Extract Content from URLs
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
# Extract content as markdown
|
|
160
|
+
parallel-cli extract https://example.com --json
|
|
161
|
+
|
|
162
|
+
# Extract with a specific focus
|
|
163
|
+
parallel-cli extract https://company.com --objective "Find pricing info" --json
|
|
164
|
+
|
|
165
|
+
# Get full page content
|
|
166
|
+
parallel-cli extract https://example.com --full-content --json
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### 4. Enrich Data
|
|
170
|
+
|
|
171
|
+
```bash
|
|
172
|
+
# Let AI suggest what columns to add
|
|
173
|
+
parallel-cli enrich suggest "Find the CEO and annual revenue" --json
|
|
174
|
+
|
|
175
|
+
# Create a config file (interactive)
|
|
176
|
+
parallel-cli enrich plan -o config.yaml
|
|
177
|
+
|
|
178
|
+
# Create a config file (non-interactive, for AI agents)
|
|
179
|
+
parallel-cli enrich plan -o config.yaml \
|
|
180
|
+
--source-type csv \
|
|
181
|
+
--source companies.csv \
|
|
182
|
+
--target enriched.csv \
|
|
183
|
+
--source-columns '[{"name": "company", "description": "Company name"}]' \
|
|
184
|
+
--intent "Find the CEO and annual revenue"
|
|
185
|
+
|
|
186
|
+
# Run enrichment from config
|
|
187
|
+
parallel-cli enrich run config.yaml
|
|
188
|
+
|
|
189
|
+
# Run enrichment directly (no config file needed)
|
|
190
|
+
parallel-cli enrich run \
|
|
191
|
+
--source-type csv \
|
|
192
|
+
--source companies.csv \
|
|
193
|
+
--target enriched.csv \
|
|
194
|
+
--source-columns '[{"name": "company", "description": "Company name"}]' \
|
|
195
|
+
--intent "Find the CEO and annual revenue"
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
### 5. Deploy to Cloud Systems
|
|
199
|
+
|
|
200
|
+
```bash
|
|
201
|
+
# Deploy to BigQuery for SQL-native enrichment
|
|
202
|
+
parallel-cli enrich deploy --system bigquery --project my-gcp-project
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
## Non-Interactive Mode (for AI Agents & Scripts)
|
|
206
|
+
|
|
207
|
+
All commands support `--json` output and can be fully controlled via CLI arguments:
|
|
208
|
+
|
|
209
|
+
```bash
|
|
210
|
+
# Search with JSON output
|
|
211
|
+
parallel-cli search "query" --json
|
|
212
|
+
|
|
213
|
+
# Extract with JSON output
|
|
214
|
+
parallel-cli extract https://url.com --json
|
|
215
|
+
|
|
216
|
+
# Suggest columns with JSON output
|
|
217
|
+
parallel-cli enrich suggest "Find CEO" --json
|
|
218
|
+
|
|
219
|
+
# Plan without prompts (provide all args)
|
|
220
|
+
parallel-cli enrich plan -o config.yaml \
|
|
221
|
+
--source-type csv \
|
|
222
|
+
--source input.csv \
|
|
223
|
+
--target output.csv \
|
|
224
|
+
--source-columns '[{"name": "company", "description": "Company name"}]' \
|
|
225
|
+
--enriched-columns '[{"name": "ceo", "description": "CEO name"}]'
|
|
226
|
+
|
|
227
|
+
# Or use --intent to let AI determine the columns
|
|
228
|
+
parallel-cli enrich plan -o config.yaml \
|
|
229
|
+
--source-type csv \
|
|
230
|
+
--source input.csv \
|
|
231
|
+
--target output.csv \
|
|
232
|
+
--source-columns '[{"name": "company", "description": "Company name"}]' \
|
|
233
|
+
--intent "Find CEO, revenue, and headquarters"
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
## Integrations
|
|
237
|
+
|
|
238
|
+
| Integration | Type | Install | Documentation |
|
|
239
|
+
|-------------|------|---------|---------------|
|
|
240
|
+
| **Polars** | Python DataFrame | `pip install parallel-web-tools[polars]` | [Setup Guide](docs/polars-setup.md) |
|
|
241
|
+
| **DuckDB** | SQL + Python | `pip install parallel-web-tools[duckdb]` | [Setup Guide](docs/duckdb-setup.md) |
|
|
242
|
+
| **Snowflake** | SQL UDF | `pip install parallel-web-tools[snowflake]` | [Setup Guide](docs/snowflake-setup.md) |
|
|
243
|
+
| **BigQuery** | Cloud Function | `pip install parallel-web-tools[bigquery]` | [Setup Guide](docs/bigquery-setup.md) |
|
|
244
|
+
| **Spark** | SQL UDF | `pip install parallel-web-tools[spark]` | [Demo Notebook](notebooks/spark_enrichment_demo.ipynb) |
|
|
245
|
+
|
|
246
|
+
### Quick Integration Examples
|
|
247
|
+
|
|
248
|
+
**Polars:**
|
|
249
|
+
```python
|
|
250
|
+
import polars as pl
|
|
251
|
+
from parallel_web_tools.integrations.polars import parallel_enrich
|
|
252
|
+
|
|
253
|
+
df = pl.DataFrame({"company": ["Google", "Microsoft"]})
|
|
254
|
+
result = parallel_enrich(
|
|
255
|
+
df,
|
|
256
|
+
input_columns={"company_name": "company"},
|
|
257
|
+
output_columns=["CEO name", "Founding year"],
|
|
258
|
+
)
|
|
259
|
+
print(result.result)
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
**DuckDB:**
|
|
263
|
+
```python
|
|
264
|
+
import duckdb
|
|
265
|
+
from parallel_web_tools.integrations.duckdb import enrich_table
|
|
266
|
+
|
|
267
|
+
conn = duckdb.connect()
|
|
268
|
+
conn.execute("CREATE TABLE companies AS SELECT 'Google' as name")
|
|
269
|
+
result = enrich_table(
|
|
270
|
+
conn,
|
|
271
|
+
source_table="companies",
|
|
272
|
+
input_columns={"company_name": "name"},
|
|
273
|
+
output_columns=["CEO name", "Founding year"],
|
|
274
|
+
)
|
|
275
|
+
print(result.result.fetchdf())
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
## Programmatic Usage
|
|
279
|
+
|
|
280
|
+
```python
|
|
281
|
+
from parallel_web_tools import run_enrichment, run_enrichment_from_dict
|
|
282
|
+
|
|
283
|
+
# From YAML file
|
|
284
|
+
run_enrichment("config.yaml")
|
|
285
|
+
|
|
286
|
+
# From dictionary
|
|
287
|
+
run_enrichment_from_dict({
|
|
288
|
+
"source": "data.csv",
|
|
289
|
+
"target": "enriched.csv",
|
|
290
|
+
"source_type": "csv",
|
|
291
|
+
"source_columns": [{"name": "company", "description": "Company name"}],
|
|
292
|
+
"enriched_columns": [{"name": "ceo", "description": "CEO name"}]
|
|
293
|
+
})
|
|
294
|
+
```
|
|
295
|
+
|
|
296
|
+
## YAML Configuration Format
|
|
297
|
+
|
|
298
|
+
```yaml
|
|
299
|
+
source: input.csv
|
|
300
|
+
target: output.csv
|
|
301
|
+
source_type: csv # csv, duckdb, or bigquery
|
|
302
|
+
processor: core-fast # lite, base, core, pro, ultra (add -fast for speed)
|
|
303
|
+
|
|
304
|
+
source_columns:
|
|
305
|
+
- name: company_name
|
|
306
|
+
description: The name of the company
|
|
307
|
+
|
|
308
|
+
enriched_columns:
|
|
309
|
+
- name: ceo
|
|
310
|
+
description: The CEO of the company
|
|
311
|
+
type: str # str, int, float, bool
|
|
312
|
+
- name: revenue
|
|
313
|
+
description: Annual revenue in USD
|
|
314
|
+
type: float
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
## Environment Variables
|
|
318
|
+
|
|
319
|
+
| Variable | Description |
|
|
320
|
+
|----------|-------------|
|
|
321
|
+
| `PARALLEL_API_KEY` | API key for authentication (alternative to `parallel-cli login`) |
|
|
322
|
+
| `DUCKDB_FILE` | Default DuckDB file path |
|
|
323
|
+
| `BIGQUERY_PROJECT` | Default BigQuery project ID |
|
|
324
|
+
|
|
325
|
+
## Related Packages
|
|
326
|
+
|
|
327
|
+
- [`parallel-web`](https://github.com/parallel-web/parallel-sdk-python) - Official Parallel Python SDK (this package depends on it)
|
|
328
|
+
|
|
329
|
+
## Development
|
|
330
|
+
|
|
331
|
+
```bash
|
|
332
|
+
git clone https://github.com/parallel-web/parallel-web-tools.git
|
|
333
|
+
cd parallel-web-tools
|
|
334
|
+
uv sync --all-extras
|
|
335
|
+
uv run pytest tests/ -v
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
## License
|
|
339
|
+
|
|
340
|
+
MIT
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
# Parallel-Web-Tools
|
|
2
|
+
|
|
3
|
+
CLI and data enrichment utilities for the [Parallel API](https://docs.parallel.ai).
|
|
4
|
+
|
|
5
|
+
> **Note:** The `parallel-web-tools` package provides the `parallel-cli` command-line tool and data enrichment utilities.
|
|
6
|
+
> It depends on [`parallel-web`](https://github.com/parallel-web/parallel-sdk-python), the official
|
|
7
|
+
> Parallel Python SDK, but does not contain it. Install `parallel-web` separately if you need
|
|
8
|
+
> direct SDK access.
|
|
9
|
+
|
|
10
|
+
## Features
|
|
11
|
+
|
|
12
|
+
- **CLI for Humans & AI Agents** - Works interactively or fully via command-line arguments
|
|
13
|
+
- **Web Search** - AI-powered search with domain filtering and date ranges
|
|
14
|
+
- **Content Extraction** - Extract clean markdown from any URL
|
|
15
|
+
- **Data Enrichment** - Enrich CSV, DuckDB, and BigQuery data with AI
|
|
16
|
+
- **AI-Assisted Planning** - Use natural language to define what data you want
|
|
17
|
+
- **Multiple Integrations** - Polars, DuckDB, Snowflake, BigQuery, Spark
|
|
18
|
+
|
|
19
|
+
## Installation
|
|
20
|
+
|
|
21
|
+
### Standalone CLI (Recommended)
|
|
22
|
+
|
|
23
|
+
Install the standalone `parallel-cli` binary for search, extract, enrichment, and deep research (no Python required):
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
curl -fsSL https://raw.githubusercontent.com/parallel-web/parallel-web-tools/main/install-cli.sh | bash
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
This automatically detects your platform (macOS/Linux, x64/arm64) and installs to `~/.local/bin`.
|
|
30
|
+
|
|
31
|
+
> **Note:** The standalone binary includes core CLI features. For deployment commands (`enrich deploy`), use pip: `pip install parallel-web-tools[snowflake]` or `[bigquery]`.
|
|
32
|
+
|
|
33
|
+
### Python Package
|
|
34
|
+
|
|
35
|
+
For programmatic usage or data enrichment integrations:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
# Full install with CLI and the DuckDB, Snowflake, and BigQuery connectors (Spark is a separate extra)
|
|
39
|
+
pip install parallel-web-tools[all]
|
|
40
|
+
|
|
41
|
+
# Library only (minimal dependencies)
|
|
42
|
+
pip install parallel-web-tools
|
|
43
|
+
|
|
44
|
+
# With specific connectors
|
|
45
|
+
pip install parallel-web-tools # CLI (its dependencies ship with the base install; no separate "cli" extra is defined)
|
|
46
|
+
pip install parallel-web-tools[polars] # Polars DataFrame
|
|
47
|
+
pip install parallel-web-tools[duckdb] # DuckDB
|
|
48
|
+
pip install parallel-web-tools[bigquery] # BigQuery
|
|
49
|
+
pip install parallel-web-tools[spark] # Apache Spark
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## CLI Overview
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
parallel-cli
|
|
56
|
+
├── auth # Check authentication status
|
|
57
|
+
├── login # OAuth login (or use PARALLEL_API_KEY env var)
|
|
58
|
+
├── logout # Remove stored credentials
|
|
59
|
+
├── search # Web search
|
|
60
|
+
├── extract # Extract content from URLs
|
|
61
|
+
└── enrich # Data enrichment commands
|
|
62
|
+
├── run # Run enrichment
|
|
63
|
+
├── plan # Create YAML config
|
|
64
|
+
├── suggest # AI suggests output columns
|
|
65
|
+
└── deploy # Deploy to cloud systems (requires pip install)
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Quick Start
|
|
69
|
+
|
|
70
|
+
### 1. Authenticate
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
# Interactive OAuth login
|
|
74
|
+
parallel-cli login
|
|
75
|
+
|
|
76
|
+
# Or set environment variable
|
|
77
|
+
export PARALLEL_API_KEY=your_api_key
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### 2. Search the Web
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
# Natural language search
|
|
84
|
+
parallel-cli search "What is Anthropic's latest AI model?" --json
|
|
85
|
+
|
|
86
|
+
# Keyword search with filters
|
|
87
|
+
parallel-cli search -q "bitcoin price" --after-date 2024-01-01 --json
|
|
88
|
+
|
|
89
|
+
# Search specific domains
|
|
90
|
+
parallel-cli search "SEC filings for Apple" --include-domains sec.gov --json
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### 3. Extract Content from URLs
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
# Extract content as markdown
|
|
97
|
+
parallel-cli extract https://example.com --json
|
|
98
|
+
|
|
99
|
+
# Extract with a specific focus
|
|
100
|
+
parallel-cli extract https://company.com --objective "Find pricing info" --json
|
|
101
|
+
|
|
102
|
+
# Get full page content
|
|
103
|
+
parallel-cli extract https://example.com --full-content --json
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### 4. Enrich Data
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
# Let AI suggest what columns to add
|
|
110
|
+
parallel-cli enrich suggest "Find the CEO and annual revenue" --json
|
|
111
|
+
|
|
112
|
+
# Create a config file (interactive)
|
|
113
|
+
parallel-cli enrich plan -o config.yaml
|
|
114
|
+
|
|
115
|
+
# Create a config file (non-interactive, for AI agents)
|
|
116
|
+
parallel-cli enrich plan -o config.yaml \
|
|
117
|
+
--source-type csv \
|
|
118
|
+
--source companies.csv \
|
|
119
|
+
--target enriched.csv \
|
|
120
|
+
--source-columns '[{"name": "company", "description": "Company name"}]' \
|
|
121
|
+
--intent "Find the CEO and annual revenue"
|
|
122
|
+
|
|
123
|
+
# Run enrichment from config
|
|
124
|
+
parallel-cli enrich run config.yaml
|
|
125
|
+
|
|
126
|
+
# Run enrichment directly (no config file needed)
|
|
127
|
+
parallel-cli enrich run \
|
|
128
|
+
--source-type csv \
|
|
129
|
+
--source companies.csv \
|
|
130
|
+
--target enriched.csv \
|
|
131
|
+
--source-columns '[{"name": "company", "description": "Company name"}]' \
|
|
132
|
+
--intent "Find the CEO and annual revenue"
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
### 5. Deploy to Cloud Systems
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
# Deploy to BigQuery for SQL-native enrichment
|
|
139
|
+
parallel-cli enrich deploy --system bigquery --project my-gcp-project
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Non-Interactive Mode (for AI Agents & Scripts)
|
|
143
|
+
|
|
144
|
+
All commands support `--json` output and can be fully controlled via CLI arguments:
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
# Search with JSON output
|
|
148
|
+
parallel-cli search "query" --json
|
|
149
|
+
|
|
150
|
+
# Extract with JSON output
|
|
151
|
+
parallel-cli extract https://url.com --json
|
|
152
|
+
|
|
153
|
+
# Suggest columns with JSON output
|
|
154
|
+
parallel-cli enrich suggest "Find CEO" --json
|
|
155
|
+
|
|
156
|
+
# Plan without prompts (provide all args)
|
|
157
|
+
parallel-cli enrich plan -o config.yaml \
|
|
158
|
+
--source-type csv \
|
|
159
|
+
--source input.csv \
|
|
160
|
+
--target output.csv \
|
|
161
|
+
--source-columns '[{"name": "company", "description": "Company name"}]' \
|
|
162
|
+
--enriched-columns '[{"name": "ceo", "description": "CEO name"}]'
|
|
163
|
+
|
|
164
|
+
# Or use --intent to let AI determine the columns
|
|
165
|
+
parallel-cli enrich plan -o config.yaml \
|
|
166
|
+
--source-type csv \
|
|
167
|
+
--source input.csv \
|
|
168
|
+
--target output.csv \
|
|
169
|
+
--source-columns '[{"name": "company", "description": "Company name"}]' \
|
|
170
|
+
--intent "Find CEO, revenue, and headquarters"
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## Integrations
|
|
174
|
+
|
|
175
|
+
| Integration | Type | Install | Documentation |
|
|
176
|
+
|-------------|------|---------|---------------|
|
|
177
|
+
| **Polars** | Python DataFrame | `pip install parallel-web-tools[polars]` | [Setup Guide](docs/polars-setup.md) |
|
|
178
|
+
| **DuckDB** | SQL + Python | `pip install parallel-web-tools[duckdb]` | [Setup Guide](docs/duckdb-setup.md) |
|
|
179
|
+
| **Snowflake** | SQL UDF | `pip install parallel-web-tools[snowflake]` | [Setup Guide](docs/snowflake-setup.md) |
|
|
180
|
+
| **BigQuery** | Cloud Function | `pip install parallel-web-tools[bigquery]` | [Setup Guide](docs/bigquery-setup.md) |
|
|
181
|
+
| **Spark** | SQL UDF | `pip install parallel-web-tools[spark]` | [Demo Notebook](notebooks/spark_enrichment_demo.ipynb) |
|
|
182
|
+
|
|
183
|
+
### Quick Integration Examples
|
|
184
|
+
|
|
185
|
+
**Polars:**
|
|
186
|
+
```python
|
|
187
|
+
import polars as pl
|
|
188
|
+
from parallel_web_tools.integrations.polars import parallel_enrich
|
|
189
|
+
|
|
190
|
+
df = pl.DataFrame({"company": ["Google", "Microsoft"]})
|
|
191
|
+
result = parallel_enrich(
|
|
192
|
+
df,
|
|
193
|
+
input_columns={"company_name": "company"},
|
|
194
|
+
output_columns=["CEO name", "Founding year"],
|
|
195
|
+
)
|
|
196
|
+
print(result.result)
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
**DuckDB:**
|
|
200
|
+
```python
|
|
201
|
+
import duckdb
|
|
202
|
+
from parallel_web_tools.integrations.duckdb import enrich_table
|
|
203
|
+
|
|
204
|
+
conn = duckdb.connect()
|
|
205
|
+
conn.execute("CREATE TABLE companies AS SELECT 'Google' as name")
|
|
206
|
+
result = enrich_table(
|
|
207
|
+
conn,
|
|
208
|
+
source_table="companies",
|
|
209
|
+
input_columns={"company_name": "name"},
|
|
210
|
+
output_columns=["CEO name", "Founding year"],
|
|
211
|
+
)
|
|
212
|
+
print(result.result.fetchdf())
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
## Programmatic Usage
|
|
216
|
+
|
|
217
|
+
```python
|
|
218
|
+
from parallel_web_tools import run_enrichment, run_enrichment_from_dict
|
|
219
|
+
|
|
220
|
+
# From YAML file
|
|
221
|
+
run_enrichment("config.yaml")
|
|
222
|
+
|
|
223
|
+
# From dictionary
|
|
224
|
+
run_enrichment_from_dict({
|
|
225
|
+
"source": "data.csv",
|
|
226
|
+
"target": "enriched.csv",
|
|
227
|
+
"source_type": "csv",
|
|
228
|
+
"source_columns": [{"name": "company", "description": "Company name"}],
|
|
229
|
+
"enriched_columns": [{"name": "ceo", "description": "CEO name"}]
|
|
230
|
+
})
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
## YAML Configuration Format
|
|
234
|
+
|
|
235
|
+
```yaml
|
|
236
|
+
source: input.csv
|
|
237
|
+
target: output.csv
|
|
238
|
+
source_type: csv # csv, duckdb, or bigquery
|
|
239
|
+
processor: core-fast # lite, base, core, pro, ultra (add -fast for speed)
|
|
240
|
+
|
|
241
|
+
source_columns:
|
|
242
|
+
- name: company_name
|
|
243
|
+
description: The name of the company
|
|
244
|
+
|
|
245
|
+
enriched_columns:
|
|
246
|
+
- name: ceo
|
|
247
|
+
description: The CEO of the company
|
|
248
|
+
type: str # str, int, float, bool
|
|
249
|
+
- name: revenue
|
|
250
|
+
description: Annual revenue in USD
|
|
251
|
+
type: float
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
## Environment Variables
|
|
255
|
+
|
|
256
|
+
| Variable | Description |
|
|
257
|
+
|----------|-------------|
|
|
258
|
+
| `PARALLEL_API_KEY` | API key for authentication (alternative to `parallel-cli login`) |
|
|
259
|
+
| `DUCKDB_FILE` | Default DuckDB file path |
|
|
260
|
+
| `BIGQUERY_PROJECT` | Default BigQuery project ID |
|
|
261
|
+
|
|
262
|
+
## Related Packages
|
|
263
|
+
|
|
264
|
+
- [`parallel-web`](https://github.com/parallel-web/parallel-sdk-python) - Official Parallel Python SDK (this package depends on it)
|
|
265
|
+
|
|
266
|
+
## Development
|
|
267
|
+
|
|
268
|
+
```bash
|
|
269
|
+
git clone https://github.com/parallel-web/parallel-web-tools.git
|
|
270
|
+
cd parallel-web-tools
|
|
271
|
+
uv sync --all-extras
|
|
272
|
+
uv run pytest tests/ -v
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
## License
|
|
276
|
+
|
|
277
|
+
MIT
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Parallel Data Enrichment package."""
|
|
2
|
+
|
|
3
|
+
# Re-export everything from core for convenience
|
|
4
|
+
from parallel_web_tools.core import (
|
|
5
|
+
# Schema
|
|
6
|
+
AVAILABLE_PROCESSORS,
|
|
7
|
+
Column,
|
|
8
|
+
InputSchema,
|
|
9
|
+
ParseError,
|
|
10
|
+
ProcessorType,
|
|
11
|
+
SourceType,
|
|
12
|
+
# Batch
|
|
13
|
+
enrich_batch,
|
|
14
|
+
enrich_single,
|
|
15
|
+
# Auth
|
|
16
|
+
get_api_key,
|
|
17
|
+
get_async_client,
|
|
18
|
+
get_auth_status,
|
|
19
|
+
get_client,
|
|
20
|
+
load_schema,
|
|
21
|
+
logout,
|
|
22
|
+
parse_input_and_output_models,
|
|
23
|
+
parse_schema,
|
|
24
|
+
# Runner
|
|
25
|
+
run_enrichment,
|
|
26
|
+
run_enrichment_from_dict,
|
|
27
|
+
run_tasks,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
__version__ = "0.0.5"
|
|
31
|
+
|
|
32
|
+
__all__ = [
|
|
33
|
+
# Auth
|
|
34
|
+
"get_api_key",
|
|
35
|
+
"get_auth_status",
|
|
36
|
+
"get_client",
|
|
37
|
+
"get_async_client",
|
|
38
|
+
"logout",
|
|
39
|
+
# Schema
|
|
40
|
+
"AVAILABLE_PROCESSORS",
|
|
41
|
+
"Column",
|
|
42
|
+
"InputSchema",
|
|
43
|
+
"ParseError",
|
|
44
|
+
"ProcessorType",
|
|
45
|
+
"SourceType",
|
|
46
|
+
"load_schema",
|
|
47
|
+
"parse_schema",
|
|
48
|
+
"parse_input_and_output_models",
|
|
49
|
+
# Batch
|
|
50
|
+
"enrich_batch",
|
|
51
|
+
"enrich_single",
|
|
52
|
+
"run_tasks",
|
|
53
|
+
# Runner
|
|
54
|
+
"run_enrichment",
|
|
55
|
+
"run_enrichment_from_dict",
|
|
56
|
+
]
|