parallel-web-tools 0.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. parallel_web_tools-0.0.5/.gitignore +21 -0
  2. parallel_web_tools-0.0.5/PKG-INFO +340 -0
  3. parallel_web_tools-0.0.5/README.md +277 -0
  4. parallel_web_tools-0.0.5/parallel_web_tools/__init__.py +56 -0
  5. parallel_web_tools-0.0.5/parallel_web_tools/cli/__init__.py +5 -0
  6. parallel_web_tools-0.0.5/parallel_web_tools/cli/commands.py +1103 -0
  7. parallel_web_tools-0.0.5/parallel_web_tools/cli/planner.py +438 -0
  8. parallel_web_tools-0.0.5/parallel_web_tools/core/__init__.py +85 -0
  9. parallel_web_tools-0.0.5/parallel_web_tools/core/auth.py +257 -0
  10. parallel_web_tools-0.0.5/parallel_web_tools/core/batch.py +280 -0
  11. parallel_web_tools-0.0.5/parallel_web_tools/core/research.py +288 -0
  12. parallel_web_tools-0.0.5/parallel_web_tools/core/result.py +29 -0
  13. parallel_web_tools-0.0.5/parallel_web_tools/core/runner.py +75 -0
  14. parallel_web_tools-0.0.5/parallel_web_tools/core/schema.py +167 -0
  15. parallel_web_tools-0.0.5/parallel_web_tools/integrations/__init__.py +53 -0
  16. parallel_web_tools-0.0.5/parallel_web_tools/integrations/bigquery/__init__.py +34 -0
  17. parallel_web_tools-0.0.5/parallel_web_tools/integrations/bigquery/cloud_function/main.py +199 -0
  18. parallel_web_tools-0.0.5/parallel_web_tools/integrations/bigquery/cloud_function/requirements.txt +5 -0
  19. parallel_web_tools-0.0.5/parallel_web_tools/integrations/bigquery/deploy.py +510 -0
  20. parallel_web_tools-0.0.5/parallel_web_tools/integrations/bigquery/sql/create_functions.sql +23 -0
  21. parallel_web_tools-0.0.5/parallel_web_tools/integrations/duckdb/__init__.py +63 -0
  22. parallel_web_tools-0.0.5/parallel_web_tools/integrations/duckdb/batch.py +220 -0
  23. parallel_web_tools-0.0.5/parallel_web_tools/integrations/duckdb/udf.py +159 -0
  24. parallel_web_tools-0.0.5/parallel_web_tools/integrations/polars/__init__.py +37 -0
  25. parallel_web_tools-0.0.5/parallel_web_tools/integrations/polars/enrich.py +218 -0
  26. parallel_web_tools-0.0.5/parallel_web_tools/integrations/snowflake/__init__.py +46 -0
  27. parallel_web_tools-0.0.5/parallel_web_tools/integrations/snowflake/deploy.py +361 -0
  28. parallel_web_tools-0.0.5/parallel_web_tools/integrations/snowflake/sql/01_setup.sql +107 -0
  29. parallel_web_tools-0.0.5/parallel_web_tools/integrations/snowflake/sql/02_create_udf.sql +116 -0
  30. parallel_web_tools-0.0.5/parallel_web_tools/integrations/snowflake/sql/03_cleanup.sql +61 -0
  31. parallel_web_tools-0.0.5/parallel_web_tools/integrations/spark/__init__.py +57 -0
  32. parallel_web_tools-0.0.5/parallel_web_tools/integrations/spark/streaming.py +407 -0
  33. parallel_web_tools-0.0.5/parallel_web_tools/integrations/spark/udf.py +206 -0
  34. parallel_web_tools-0.0.5/parallel_web_tools/integrations/utils.py +32 -0
  35. parallel_web_tools-0.0.5/parallel_web_tools/processors/__init__.py +20 -0
  36. parallel_web_tools-0.0.5/parallel_web_tools/processors/bigquery.py +70 -0
  37. parallel_web_tools-0.0.5/parallel_web_tools/processors/csv.py +32 -0
  38. parallel_web_tools-0.0.5/parallel_web_tools/processors/duckdb.py +25 -0
  39. parallel_web_tools-0.0.5/pyproject.toml +140 -0
@@ -0,0 +1,21 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+
12
+ # Environment files
13
+ .env.local
14
+ .env
15
+
16
+ # Data files
17
+ data/
18
+ configs/
19
+ *.db
20
+ output_*.csv
21
+ notebooks/debug_enrichment.ipynb
@@ -0,0 +1,340 @@
1
+ Metadata-Version: 2.4
2
+ Name: parallel-web-tools
3
+ Version: 0.0.5
4
+ Summary: Parallel Tools: CLI and data enrichment utilities for the Parallel API
5
+ Project-URL: Homepage, https://github.com/parallel-web/parallel-web-tools
6
+ Project-URL: Documentation, https://docs.parallel.ai
7
+ Project-URL: Repository, https://github.com/parallel-web/parallel-web-tools
8
+ Project-URL: Issues, https://github.com/parallel-web/parallel-web-tools/issues
9
+ Author-email: Parallel <support@parallel.ai>
10
+ License-Expression: MIT
11
+ Keywords: ai,data-enrichment,data-pipeline,etl,llm,parallel,web-search
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Database
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Requires-Python: >=3.12
22
+ Requires-Dist: click>=8.1.0
23
+ Requires-Dist: httpx>=0.25.0
24
+ Requires-Dist: pandas>=2.3.0
25
+ Requires-Dist: parallel-web>=0.4.0
26
+ Requires-Dist: polars>=1.37.0
27
+ Requires-Dist: pyarrow>=18.0.0
28
+ Requires-Dist: python-dotenv>=1.0.0
29
+ Requires-Dist: pyyaml>=6.0.0
30
+ Requires-Dist: questionary>=2.0.0
31
+ Requires-Dist: rich>=13.0.0
32
+ Provides-Extra: all
33
+ Requires-Dist: duckdb>=1.0.0; extra == 'all'
34
+ Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'all'
35
+ Requires-Dist: sqlalchemy-bigquery>=1.11.0; extra == 'all'
36
+ Requires-Dist: sqlalchemy>=2.0.0; extra == 'all'
37
+ Provides-Extra: bigquery
38
+ Requires-Dist: sqlalchemy-bigquery>=1.11.0; extra == 'bigquery'
39
+ Requires-Dist: sqlalchemy>=2.0.0; extra == 'bigquery'
40
+ Provides-Extra: bigquery-native
41
+ Provides-Extra: dev
42
+ Requires-Dist: duckdb>=1.0.0; extra == 'dev'
43
+ Requires-Dist: httpx>=0.25.0; extra == 'dev'
44
+ Requires-Dist: pre-commit>=4.0.0; extra == 'dev'
45
+ Requires-Dist: pyinstaller>=6.0.0; extra == 'dev'
46
+ Requires-Dist: pyrefly>=0.49.0; extra == 'dev'
47
+ Requires-Dist: pyspark>=3.4.0; extra == 'dev'
48
+ Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
49
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
50
+ Requires-Dist: ruff>=0.14.0; extra == 'dev'
51
+ Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'dev'
52
+ Requires-Dist: sqlalchemy-bigquery>=1.11.0; extra == 'dev'
53
+ Requires-Dist: sqlalchemy>=2.0.0; extra == 'dev'
54
+ Provides-Extra: duckdb
55
+ Requires-Dist: duckdb>=1.0.0; extra == 'duckdb'
56
+ Provides-Extra: polars
57
+ Provides-Extra: snowflake
58
+ Requires-Dist: snowflake-connector-python>=3.0.0; extra == 'snowflake'
59
+ Provides-Extra: spark
60
+ Requires-Dist: httpx>=0.25.0; extra == 'spark'
61
+ Requires-Dist: pyspark>=3.4.0; extra == 'spark'
62
+ Description-Content-Type: text/markdown
63
+
64
+ # Parallel-Web-Tools
65
+
66
+ CLI and data enrichment utilities for the [Parallel API](https://docs.parallel.ai).
67
+
68
+ > **Note:** This package provides the `parallel-cli` command-line tool and data enrichment utilities.
69
+ > It depends on [`parallel-web`](https://github.com/parallel-web/parallel-sdk-python), the official
70
+ > Parallel Python SDK, but does not contain it. Install `parallel-web` separately if you need
71
+ > direct SDK access.
72
+
73
+ ## Features
74
+
75
+ - **CLI for Humans & AI Agents** - Works interactively or fully via command-line arguments
76
+ - **Web Search** - AI-powered search with domain filtering and date ranges
77
+ - **Content Extraction** - Extract clean markdown from any URL
78
+ - **Data Enrichment** - Enrich CSV, DuckDB, and BigQuery data with AI
79
+ - **AI-Assisted Planning** - Use natural language to define what data you want
80
+ - **Multiple Integrations** - Polars, DuckDB, Snowflake, BigQuery, Spark
81
+
82
+ ## Installation
83
+
84
+ ### Standalone CLI (Recommended)
85
+
86
+ Install the standalone `parallel-cli` binary for search, extract, enrichment, and deep research (no Python required):
87
+
88
+ ```bash
89
+ curl -fsSL https://raw.githubusercontent.com/parallel-web/parallel-web-tools/main/install-cli.sh | bash
90
+ ```
91
+
92
+ This automatically detects your platform (macOS/Linux, x64/arm64) and installs to `~/.local/bin`.
93
+
94
+ > **Note:** The standalone binary includes core CLI features. For deployment commands (`enrich deploy`), use pip: `pip install parallel-web-tools[snowflake]` or `[bigquery]`.
95
+
96
+ ### Python Package
97
+
98
+ For programmatic usage or data enrichment integrations:
99
+
100
+ ```bash
101
+ # Full install with CLI and all connectors
102
+ pip install parallel-web-tools[all]
103
+
104
+ # Library only (minimal dependencies)
105
+ pip install parallel-web-tools
106
+
107
+ # With specific connectors
108
+ pip install parallel-web-tools[snowflake] # Snowflake
109
+ pip install parallel-web-tools[polars] # Polars DataFrame
110
+ pip install parallel-web-tools[duckdb] # DuckDB
111
+ pip install parallel-web-tools[bigquery] # BigQuery
112
+ pip install parallel-web-tools[spark] # Apache Spark
113
+ ```
114
+
115
+ ## CLI Overview
116
+
117
+ ```
118
+ parallel-cli
119
+ ├── auth # Check authentication status
120
+ ├── login # OAuth login (or use PARALLEL_API_KEY env var)
121
+ ├── logout # Remove stored credentials
122
+ ├── search # Web search
123
+ ├── extract # Extract content from URLs
124
+ └── enrich # Data enrichment commands
125
+ ├── run # Run enrichment
126
+ ├── plan # Create YAML config
127
+ ├── suggest # AI suggests output columns
128
+ └── deploy # Deploy to cloud systems (requires pip install)
129
+ ```
130
+
131
+ ## Quick Start
132
+
133
+ ### 1. Authenticate
134
+
135
+ ```bash
136
+ # Interactive OAuth login
137
+ parallel-cli login
138
+
139
+ # Or set environment variable
140
+ export PARALLEL_API_KEY=your_api_key
141
+ ```
142
+
143
+ ### 2. Search the Web
144
+
145
+ ```bash
146
+ # Natural language search
147
+ parallel-cli search "What is Anthropic's latest AI model?" --json
148
+
149
+ # Keyword search with filters
150
+ parallel-cli search -q "bitcoin price" --after-date 2024-01-01 --json
151
+
152
+ # Search specific domains
153
+ parallel-cli search "SEC filings for Apple" --include-domains sec.gov --json
154
+ ```
155
+
156
+ ### 3. Extract Content from URLs
157
+
158
+ ```bash
159
+ # Extract content as markdown
160
+ parallel-cli extract https://example.com --json
161
+
162
+ # Extract with a specific focus
163
+ parallel-cli extract https://company.com --objective "Find pricing info" --json
164
+
165
+ # Get full page content
166
+ parallel-cli extract https://example.com --full-content --json
167
+ ```
168
+
169
+ ### 4. Enrich Data
170
+
171
+ ```bash
172
+ # Let AI suggest what columns to add
173
+ parallel-cli enrich suggest "Find the CEO and annual revenue" --json
174
+
175
+ # Create a config file (interactive)
176
+ parallel-cli enrich plan -o config.yaml
177
+
178
+ # Create a config file (non-interactive, for AI agents)
179
+ parallel-cli enrich plan -o config.yaml \
180
+ --source-type csv \
181
+ --source companies.csv \
182
+ --target enriched.csv \
183
+ --source-columns '[{"name": "company", "description": "Company name"}]' \
184
+ --intent "Find the CEO and annual revenue"
185
+
186
+ # Run enrichment from config
187
+ parallel-cli enrich run config.yaml
188
+
189
+ # Run enrichment directly (no config file needed)
190
+ parallel-cli enrich run \
191
+ --source-type csv \
192
+ --source companies.csv \
193
+ --target enriched.csv \
194
+ --source-columns '[{"name": "company", "description": "Company name"}]' \
195
+ --intent "Find the CEO and annual revenue"
196
+ ```
197
+
198
+ ### 5. Deploy to Cloud Systems
199
+
200
+ ```bash
201
+ # Deploy to BigQuery for SQL-native enrichment
202
+ parallel-cli enrich deploy --system bigquery --project my-gcp-project
203
+ ```
204
+
205
+ ## Non-Interactive Mode (for AI Agents & Scripts)
206
+
207
+ All commands support `--json` output and can be fully controlled via CLI arguments:
208
+
209
+ ```bash
210
+ # Search with JSON output
211
+ parallel-cli search "query" --json
212
+
213
+ # Extract with JSON output
214
+ parallel-cli extract https://url.com --json
215
+
216
+ # Suggest columns with JSON output
217
+ parallel-cli enrich suggest "Find CEO" --json
218
+
219
+ # Plan without prompts (provide all args)
220
+ parallel-cli enrich plan -o config.yaml \
221
+ --source-type csv \
222
+ --source input.csv \
223
+ --target output.csv \
224
+ --source-columns '[{"name": "company", "description": "Company name"}]' \
225
+ --enriched-columns '[{"name": "ceo", "description": "CEO name"}]'
226
+
227
+ # Or use --intent to let AI determine the columns
228
+ parallel-cli enrich plan -o config.yaml \
229
+ --source-type csv \
230
+ --source input.csv \
231
+ --target output.csv \
232
+ --source-columns '[{"name": "company", "description": "Company name"}]' \
233
+ --intent "Find CEO, revenue, and headquarters"
234
+ ```
235
+
236
+ ## Integrations
237
+
238
+ | Integration | Type | Install | Documentation |
239
+ |-------------|------|---------|---------------|
240
+ | **Polars** | Python DataFrame | `pip install parallel-web-tools[polars]` | [Setup Guide](docs/polars-setup.md) |
241
+ | **DuckDB** | SQL + Python | `pip install parallel-web-tools[duckdb]` | [Setup Guide](docs/duckdb-setup.md) |
242
+ | **Snowflake** | SQL UDF | `pip install parallel-web-tools[snowflake]` | [Setup Guide](docs/snowflake-setup.md) |
243
+ | **BigQuery** | Cloud Function | `pip install parallel-web-tools[bigquery]` | [Setup Guide](docs/bigquery-setup.md) |
244
+ | **Spark** | SQL UDF | `pip install parallel-web-tools[spark]` | [Demo Notebook](notebooks/spark_enrichment_demo.ipynb) |
245
+
246
+ ### Quick Integration Examples
247
+
248
+ **Polars:**
249
+ ```python
250
+ import polars as pl
251
+ from parallel_web_tools.integrations.polars import parallel_enrich
252
+
253
+ df = pl.DataFrame({"company": ["Google", "Microsoft"]})
254
+ result = parallel_enrich(
255
+ df,
256
+ input_columns={"company_name": "company"},
257
+ output_columns=["CEO name", "Founding year"],
258
+ )
259
+ print(result.result)
260
+ ```
261
+
262
+ **DuckDB:**
263
+ ```python
264
+ import duckdb
265
+ from parallel_web_tools.integrations.duckdb import enrich_table
266
+
267
+ conn = duckdb.connect()
268
+ conn.execute("CREATE TABLE companies AS SELECT 'Google' as name")
269
+ result = enrich_table(
270
+ conn,
271
+ source_table="companies",
272
+ input_columns={"company_name": "name"},
273
+ output_columns=["CEO name", "Founding year"],
274
+ )
275
+ print(result.result.fetchdf())
276
+ ```
277
+
278
+ ## Programmatic Usage
279
+
280
+ ```python
281
+ from parallel_web_tools import run_enrichment, run_enrichment_from_dict
282
+
283
+ # From YAML file
284
+ run_enrichment("config.yaml")
285
+
286
+ # From dictionary
287
+ run_enrichment_from_dict({
288
+ "source": "data.csv",
289
+ "target": "enriched.csv",
290
+ "source_type": "csv",
291
+ "source_columns": [{"name": "company", "description": "Company name"}],
292
+ "enriched_columns": [{"name": "ceo", "description": "CEO name"}]
293
+ })
294
+ ```
295
+
296
+ ## YAML Configuration Format
297
+
298
+ ```yaml
299
+ source: input.csv
300
+ target: output.csv
301
+ source_type: csv # csv, duckdb, or bigquery
302
+ processor: core-fast # lite, base, core, pro, ultra (add -fast for speed)
303
+
304
+ source_columns:
305
+ - name: company_name
306
+ description: The name of the company
307
+
308
+ enriched_columns:
309
+ - name: ceo
310
+ description: The CEO of the company
311
+ type: str # str, int, float, bool
312
+ - name: revenue
313
+ description: Annual revenue in USD
314
+ type: float
315
+ ```
316
+
317
+ ## Environment Variables
318
+
319
+ | Variable | Description |
320
+ |----------|-------------|
321
+ | `PARALLEL_API_KEY` | API key for authentication (alternative to `parallel-cli login`) |
322
+ | `DUCKDB_FILE` | Default DuckDB file path |
323
+ | `BIGQUERY_PROJECT` | Default BigQuery project ID |
324
+
325
+ ## Related Packages
326
+
327
+ - [`parallel-web`](https://github.com/parallel-web/parallel-sdk-python) - Official Parallel Python SDK (this package depends on it)
328
+
329
+ ## Development
330
+
331
+ ```bash
332
+ git clone https://github.com/parallel-web/parallel-web-tools.git
333
+ cd parallel-web-tools
334
+ uv sync --all-extras
335
+ uv run pytest tests/ -v
336
+ ```
337
+
338
+ ## License
339
+
340
+ MIT
@@ -0,0 +1,277 @@
1
+ # Parallel-Web-Tools
2
+
3
+ CLI and data enrichment utilities for the [Parallel API](https://docs.parallel.ai).
4
+
5
+ > **Note:** This package provides the `parallel-cli` command-line tool and data enrichment utilities.
6
+ > It depends on [`parallel-web`](https://github.com/parallel-web/parallel-sdk-python), the official
7
+ > Parallel Python SDK, but does not contain it. Install `parallel-web` separately if you need
8
+ > direct SDK access.
9
+
10
+ ## Features
11
+
12
+ - **CLI for Humans & AI Agents** - Works interactively or fully via command-line arguments
13
+ - **Web Search** - AI-powered search with domain filtering and date ranges
14
+ - **Content Extraction** - Extract clean markdown from any URL
15
+ - **Data Enrichment** - Enrich CSV, DuckDB, and BigQuery data with AI
16
+ - **AI-Assisted Planning** - Use natural language to define what data you want
17
+ - **Multiple Integrations** - Polars, DuckDB, Snowflake, BigQuery, Spark
18
+
19
+ ## Installation
20
+
21
+ ### Standalone CLI (Recommended)
22
+
23
+ Install the standalone `parallel-cli` binary for search, extract, enrichment, and deep research (no Python required):
24
+
25
+ ```bash
26
+ curl -fsSL https://raw.githubusercontent.com/parallel-web/parallel-web-tools/main/install-cli.sh | bash
27
+ ```
28
+
29
+ This automatically detects your platform (macOS/Linux, x64/arm64) and installs to `~/.local/bin`.
30
+
31
+ > **Note:** The standalone binary includes core CLI features. For deployment commands (`enrich deploy`), use pip: `pip install parallel-web-tools[snowflake]` or `[bigquery]`.
32
+
33
+ ### Python Package
34
+
35
+ For programmatic usage or data enrichment integrations:
36
+
37
+ ```bash
38
+ # Full install with CLI and all connectors
39
+ pip install parallel-web-tools[all]
40
+
41
+ # Library only (minimal dependencies)
42
+ pip install parallel-web-tools
43
+
44
+ # With specific connectors
45
+ pip install parallel-web-tools[snowflake] # Snowflake
46
+ pip install parallel-web-tools[polars] # Polars DataFrame
47
+ pip install parallel-web-tools[duckdb] # DuckDB
48
+ pip install parallel-web-tools[bigquery] # BigQuery
49
+ pip install parallel-web-tools[spark] # Apache Spark
50
+ ```
51
+
52
+ ## CLI Overview
53
+
54
+ ```
55
+ parallel-cli
56
+ ├── auth # Check authentication status
57
+ ├── login # OAuth login (or use PARALLEL_API_KEY env var)
58
+ ├── logout # Remove stored credentials
59
+ ├── search # Web search
60
+ ├── extract # Extract content from URLs
61
+ └── enrich # Data enrichment commands
62
+ ├── run # Run enrichment
63
+ ├── plan # Create YAML config
64
+ ├── suggest # AI suggests output columns
65
+ └── deploy # Deploy to cloud systems (requires pip install)
66
+ ```
67
+
68
+ ## Quick Start
69
+
70
+ ### 1. Authenticate
71
+
72
+ ```bash
73
+ # Interactive OAuth login
74
+ parallel-cli login
75
+
76
+ # Or set environment variable
77
+ export PARALLEL_API_KEY=your_api_key
78
+ ```
79
+
80
+ ### 2. Search the Web
81
+
82
+ ```bash
83
+ # Natural language search
84
+ parallel-cli search "What is Anthropic's latest AI model?" --json
85
+
86
+ # Keyword search with filters
87
+ parallel-cli search -q "bitcoin price" --after-date 2024-01-01 --json
88
+
89
+ # Search specific domains
90
+ parallel-cli search "SEC filings for Apple" --include-domains sec.gov --json
91
+ ```
92
+
93
+ ### 3. Extract Content from URLs
94
+
95
+ ```bash
96
+ # Extract content as markdown
97
+ parallel-cli extract https://example.com --json
98
+
99
+ # Extract with a specific focus
100
+ parallel-cli extract https://company.com --objective "Find pricing info" --json
101
+
102
+ # Get full page content
103
+ parallel-cli extract https://example.com --full-content --json
104
+ ```
105
+
106
+ ### 4. Enrich Data
107
+
108
+ ```bash
109
+ # Let AI suggest what columns to add
110
+ parallel-cli enrich suggest "Find the CEO and annual revenue" --json
111
+
112
+ # Create a config file (interactive)
113
+ parallel-cli enrich plan -o config.yaml
114
+
115
+ # Create a config file (non-interactive, for AI agents)
116
+ parallel-cli enrich plan -o config.yaml \
117
+ --source-type csv \
118
+ --source companies.csv \
119
+ --target enriched.csv \
120
+ --source-columns '[{"name": "company", "description": "Company name"}]' \
121
+ --intent "Find the CEO and annual revenue"
122
+
123
+ # Run enrichment from config
124
+ parallel-cli enrich run config.yaml
125
+
126
+ # Run enrichment directly (no config file needed)
127
+ parallel-cli enrich run \
128
+ --source-type csv \
129
+ --source companies.csv \
130
+ --target enriched.csv \
131
+ --source-columns '[{"name": "company", "description": "Company name"}]' \
132
+ --intent "Find the CEO and annual revenue"
133
+ ```
134
+
135
+ ### 5. Deploy to Cloud Systems
136
+
137
+ ```bash
138
+ # Deploy to BigQuery for SQL-native enrichment
139
+ parallel-cli enrich deploy --system bigquery --project my-gcp-project
140
+ ```
141
+
142
+ ## Non-Interactive Mode (for AI Agents & Scripts)
143
+
144
+ All commands support `--json` output and can be fully controlled via CLI arguments:
145
+
146
+ ```bash
147
+ # Search with JSON output
148
+ parallel-cli search "query" --json
149
+
150
+ # Extract with JSON output
151
+ parallel-cli extract https://url.com --json
152
+
153
+ # Suggest columns with JSON output
154
+ parallel-cli enrich suggest "Find CEO" --json
155
+
156
+ # Plan without prompts (provide all args)
157
+ parallel-cli enrich plan -o config.yaml \
158
+ --source-type csv \
159
+ --source input.csv \
160
+ --target output.csv \
161
+ --source-columns '[{"name": "company", "description": "Company name"}]' \
162
+ --enriched-columns '[{"name": "ceo", "description": "CEO name"}]'
163
+
164
+ # Or use --intent to let AI determine the columns
165
+ parallel-cli enrich plan -o config.yaml \
166
+ --source-type csv \
167
+ --source input.csv \
168
+ --target output.csv \
169
+ --source-columns '[{"name": "company", "description": "Company name"}]' \
170
+ --intent "Find CEO, revenue, and headquarters"
171
+ ```
172
+
173
+ ## Integrations
174
+
175
+ | Integration | Type | Install | Documentation |
176
+ |-------------|------|---------|---------------|
177
+ | **Polars** | Python DataFrame | `pip install parallel-web-tools[polars]` | [Setup Guide](docs/polars-setup.md) |
178
+ | **DuckDB** | SQL + Python | `pip install parallel-web-tools[duckdb]` | [Setup Guide](docs/duckdb-setup.md) |
179
+ | **Snowflake** | SQL UDF | `pip install parallel-web-tools[snowflake]` | [Setup Guide](docs/snowflake-setup.md) |
180
+ | **BigQuery** | Cloud Function | `pip install parallel-web-tools[bigquery]` | [Setup Guide](docs/bigquery-setup.md) |
181
+ | **Spark** | SQL UDF | `pip install parallel-web-tools[spark]` | [Demo Notebook](notebooks/spark_enrichment_demo.ipynb) |
182
+
183
+ ### Quick Integration Examples
184
+
185
+ **Polars:**
186
+ ```python
187
+ import polars as pl
188
+ from parallel_web_tools.integrations.polars import parallel_enrich
189
+
190
+ df = pl.DataFrame({"company": ["Google", "Microsoft"]})
191
+ result = parallel_enrich(
192
+ df,
193
+ input_columns={"company_name": "company"},
194
+ output_columns=["CEO name", "Founding year"],
195
+ )
196
+ print(result.result)
197
+ ```
198
+
199
+ **DuckDB:**
200
+ ```python
201
+ import duckdb
202
+ from parallel_web_tools.integrations.duckdb import enrich_table
203
+
204
+ conn = duckdb.connect()
205
+ conn.execute("CREATE TABLE companies AS SELECT 'Google' as name")
206
+ result = enrich_table(
207
+ conn,
208
+ source_table="companies",
209
+ input_columns={"company_name": "name"},
210
+ output_columns=["CEO name", "Founding year"],
211
+ )
212
+ print(result.result.fetchdf())
213
+ ```
214
+
215
+ ## Programmatic Usage
216
+
217
+ ```python
218
+ from parallel_web_tools import run_enrichment, run_enrichment_from_dict
219
+
220
+ # From YAML file
221
+ run_enrichment("config.yaml")
222
+
223
+ # From dictionary
224
+ run_enrichment_from_dict({
225
+ "source": "data.csv",
226
+ "target": "enriched.csv",
227
+ "source_type": "csv",
228
+ "source_columns": [{"name": "company", "description": "Company name"}],
229
+ "enriched_columns": [{"name": "ceo", "description": "CEO name"}]
230
+ })
231
+ ```
232
+
233
+ ## YAML Configuration Format
234
+
235
+ ```yaml
236
+ source: input.csv
237
+ target: output.csv
238
+ source_type: csv # csv, duckdb, or bigquery
239
+ processor: core-fast # lite, base, core, pro, ultra (add -fast for speed)
240
+
241
+ source_columns:
242
+ - name: company_name
243
+ description: The name of the company
244
+
245
+ enriched_columns:
246
+ - name: ceo
247
+ description: The CEO of the company
248
+ type: str # str, int, float, bool
249
+ - name: revenue
250
+ description: Annual revenue in USD
251
+ type: float
252
+ ```
253
+
254
+ ## Environment Variables
255
+
256
+ | Variable | Description |
257
+ |----------|-------------|
258
+ | `PARALLEL_API_KEY` | API key for authentication (alternative to `parallel-cli login`) |
259
+ | `DUCKDB_FILE` | Default DuckDB file path |
260
+ | `BIGQUERY_PROJECT` | Default BigQuery project ID |
261
+
262
+ ## Related Packages
263
+
264
+ - [`parallel-web`](https://github.com/parallel-web/parallel-sdk-python) - Official Parallel Python SDK (this package depends on it)
265
+
266
+ ## Development
267
+
268
+ ```bash
269
+ git clone https://github.com/parallel-web/parallel-web-tools.git
270
+ cd parallel-web-tools
271
+ uv sync --all-extras
272
+ uv run pytest tests/ -v
273
+ ```
274
+
275
+ ## License
276
+
277
+ MIT
@@ -0,0 +1,56 @@
1
+ """Parallel Data Enrichment package."""
2
+
3
+ # Re-export everything from core for convenience
4
+ from parallel_web_tools.core import (
5
+ # Schema
6
+ AVAILABLE_PROCESSORS,
7
+ Column,
8
+ InputSchema,
9
+ ParseError,
10
+ ProcessorType,
11
+ SourceType,
12
+ # Batch
13
+ enrich_batch,
14
+ enrich_single,
15
+ # Auth
16
+ get_api_key,
17
+ get_async_client,
18
+ get_auth_status,
19
+ get_client,
20
+ load_schema,
21
+ logout,
22
+ parse_input_and_output_models,
23
+ parse_schema,
24
+ # Runner
25
+ run_enrichment,
26
+ run_enrichment_from_dict,
27
+ run_tasks,
28
+ )
29
+
30
+ __version__ = "0.0.5"
31
+
32
+ __all__ = [
33
+ # Auth
34
+ "get_api_key",
35
+ "get_auth_status",
36
+ "get_client",
37
+ "get_async_client",
38
+ "logout",
39
+ # Schema
40
+ "AVAILABLE_PROCESSORS",
41
+ "Column",
42
+ "InputSchema",
43
+ "ParseError",
44
+ "ProcessorType",
45
+ "SourceType",
46
+ "load_schema",
47
+ "parse_schema",
48
+ "parse_input_and_output_models",
49
+ # Batch
50
+ "enrich_batch",
51
+ "enrich_single",
52
+ "run_tasks",
53
+ # Runner
54
+ "run_enrichment",
55
+ "run_enrichment_from_dict",
56
+ ]
@@ -0,0 +1,5 @@
1
+ """CLI for Parallel Data."""
2
+
3
+ from parallel_web_tools.cli.commands import main
4
+
5
+ __all__ = ["main"]