parallel-web-tools 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parallel_web_tools/__init__.py +56 -0
- parallel_web_tools/cli/__init__.py +5 -0
- parallel_web_tools/cli/commands.py +626 -0
- parallel_web_tools/cli/planner.py +438 -0
- parallel_web_tools/core/__init__.py +70 -0
- parallel_web_tools/core/auth.py +256 -0
- parallel_web_tools/core/batch.py +307 -0
- parallel_web_tools/core/result.py +29 -0
- parallel_web_tools/core/runner.py +75 -0
- parallel_web_tools/core/schema.py +169 -0
- parallel_web_tools/integrations/__init__.py +17 -0
- parallel_web_tools/integrations/bigquery/__init__.py +34 -0
- parallel_web_tools/integrations/bigquery/cloud_function/main.py +199 -0
- parallel_web_tools/integrations/bigquery/cloud_function/requirements.txt +5 -0
- parallel_web_tools/integrations/bigquery/deploy.py +456 -0
- parallel_web_tools/integrations/bigquery/sql/create_functions.sql +49 -0
- parallel_web_tools/integrations/duckdb/__init__.py +63 -0
- parallel_web_tools/integrations/duckdb/batch.py +220 -0
- parallel_web_tools/integrations/duckdb/udf.py +159 -0
- parallel_web_tools/integrations/polars/__init__.py +37 -0
- parallel_web_tools/integrations/polars/enrich.py +218 -0
- parallel_web_tools/integrations/snowflake/__init__.py +46 -0
- parallel_web_tools/integrations/snowflake/deploy.py +347 -0
- parallel_web_tools/integrations/snowflake/sql/01_setup.sql +97 -0
- parallel_web_tools/integrations/snowflake/sql/02_create_udf.sql +260 -0
- parallel_web_tools/integrations/snowflake/sql/03_cleanup.sql +61 -0
- parallel_web_tools/integrations/spark/__init__.py +57 -0
- parallel_web_tools/integrations/spark/streaming.py +404 -0
- parallel_web_tools/integrations/spark/udf.py +206 -0
- parallel_web_tools/integrations/utils.py +32 -0
- parallel_web_tools/processors/__init__.py +20 -0
- parallel_web_tools/processors/bigquery.py +70 -0
- parallel_web_tools/processors/csv.py +32 -0
- parallel_web_tools/processors/duckdb.py +25 -0
- parallel_web_tools-0.0.1.dist-info/METADATA +346 -0
- parallel_web_tools-0.0.1.dist-info/RECORD +38 -0
- parallel_web_tools-0.0.1.dist-info/WHEEL +4 -0
- parallel_web_tools-0.0.1.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Parallel Data Enrichment package."""
|
|
2
|
+
|
|
3
|
+
# Re-export everything from core for convenience
|
|
4
|
+
from parallel_web_tools.core import (
|
|
5
|
+
# Schema
|
|
6
|
+
AVAILABLE_PROCESSORS,
|
|
7
|
+
Column,
|
|
8
|
+
InputSchema,
|
|
9
|
+
ParseError,
|
|
10
|
+
ProcessorType,
|
|
11
|
+
SourceType,
|
|
12
|
+
# Batch
|
|
13
|
+
enrich_batch,
|
|
14
|
+
enrich_single,
|
|
15
|
+
# Auth
|
|
16
|
+
get_api_key,
|
|
17
|
+
get_async_client,
|
|
18
|
+
get_auth_status,
|
|
19
|
+
get_client,
|
|
20
|
+
load_schema,
|
|
21
|
+
logout,
|
|
22
|
+
parse_input_and_output_models,
|
|
23
|
+
parse_schema,
|
|
24
|
+
# Runner
|
|
25
|
+
run_enrichment,
|
|
26
|
+
run_enrichment_from_dict,
|
|
27
|
+
run_tasks,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
__version__ = "0.0.1"
|
|
31
|
+
|
|
32
|
+
__all__ = [
|
|
33
|
+
# Auth
|
|
34
|
+
"get_api_key",
|
|
35
|
+
"get_auth_status",
|
|
36
|
+
"get_client",
|
|
37
|
+
"get_async_client",
|
|
38
|
+
"logout",
|
|
39
|
+
# Schema
|
|
40
|
+
"AVAILABLE_PROCESSORS",
|
|
41
|
+
"Column",
|
|
42
|
+
"InputSchema",
|
|
43
|
+
"ParseError",
|
|
44
|
+
"ProcessorType",
|
|
45
|
+
"SourceType",
|
|
46
|
+
"load_schema",
|
|
47
|
+
"parse_schema",
|
|
48
|
+
"parse_input_and_output_models",
|
|
49
|
+
# Batch
|
|
50
|
+
"enrich_batch",
|
|
51
|
+
"enrich_single",
|
|
52
|
+
"run_tasks",
|
|
53
|
+
# Runner
|
|
54
|
+
"run_enrichment",
|
|
55
|
+
"run_enrichment_from_dict",
|
|
56
|
+
]
|
|
@@ -0,0 +1,626 @@
|
|
|
1
|
+
"""CLI commands for Parallel."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
import httpx
|
|
10
|
+
from dotenv import load_dotenv
|
|
11
|
+
from rich.console import Console
|
|
12
|
+
|
|
13
|
+
from parallel_web_tools import __version__
|
|
14
|
+
from parallel_web_tools.cli.planner import create_config_interactive, save_config
|
|
15
|
+
from parallel_web_tools.core import (
|
|
16
|
+
AVAILABLE_PROCESSORS,
|
|
17
|
+
JSON_SCHEMA_TYPE_MAP,
|
|
18
|
+
get_api_key,
|
|
19
|
+
get_auth_status,
|
|
20
|
+
logout,
|
|
21
|
+
run_enrichment,
|
|
22
|
+
run_enrichment_from_dict,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)
|
|
26
|
+
logger = logging.getLogger(__name__)
|
|
27
|
+
console = Console()
|
|
28
|
+
|
|
29
|
+
load_dotenv(".env.local")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def parse_columns(columns_json: str | None) -> list[dict[str, str]] | None:
|
|
33
|
+
"""Parse columns from JSON string."""
|
|
34
|
+
if not columns_json:
|
|
35
|
+
return None
|
|
36
|
+
try:
|
|
37
|
+
columns = json.loads(columns_json)
|
|
38
|
+
if not isinstance(columns, list):
|
|
39
|
+
raise click.BadParameter("Columns must be a JSON array")
|
|
40
|
+
for col in columns:
|
|
41
|
+
if "name" not in col:
|
|
42
|
+
raise click.BadParameter("Each column must have a 'name' field")
|
|
43
|
+
if "description" not in col:
|
|
44
|
+
raise click.BadParameter("Each column must have a 'description' field")
|
|
45
|
+
return columns
|
|
46
|
+
except json.JSONDecodeError as e:
|
|
47
|
+
raise click.BadParameter(f"Invalid JSON: {e}") from e
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def validate_enrich_args(
|
|
51
|
+
source_type: str | None,
|
|
52
|
+
source: str | None,
|
|
53
|
+
target: str | None,
|
|
54
|
+
source_columns: str | None,
|
|
55
|
+
enriched_columns: str | None,
|
|
56
|
+
intent: str | None,
|
|
57
|
+
) -> None:
|
|
58
|
+
"""Validate enrichment CLI arguments.
|
|
59
|
+
|
|
60
|
+
Raises click.Abort with appropriate error messages for invalid combinations.
|
|
61
|
+
"""
|
|
62
|
+
if enriched_columns and intent:
|
|
63
|
+
console.print("[bold red]Error: Use either --enriched-columns OR --intent, not both.[/bold red]")
|
|
64
|
+
raise click.Abort()
|
|
65
|
+
|
|
66
|
+
base_args = [source_type, source, target, source_columns]
|
|
67
|
+
has_base = all(arg is not None for arg in base_args)
|
|
68
|
+
has_output_spec = enriched_columns is not None or intent is not None
|
|
69
|
+
|
|
70
|
+
if any(arg is not None for arg in base_args) or has_output_spec:
|
|
71
|
+
if not has_base:
|
|
72
|
+
missing = [
|
|
73
|
+
n
|
|
74
|
+
for n, v in [
|
|
75
|
+
("--source-type", source_type),
|
|
76
|
+
("--source", source),
|
|
77
|
+
("--target", target),
|
|
78
|
+
("--source-columns", source_columns),
|
|
79
|
+
]
|
|
80
|
+
if not v
|
|
81
|
+
]
|
|
82
|
+
console.print(f"[bold red]Error: Missing required options: {', '.join(missing)}[/bold red]")
|
|
83
|
+
raise click.Abort()
|
|
84
|
+
if not has_output_spec:
|
|
85
|
+
console.print("[bold red]Error: Provide --enriched-columns OR --intent.[/bold red]")
|
|
86
|
+
raise click.Abort()
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def build_config_from_args(
    source_type: str,
    source: str,
    target: str,
    source_columns: list[dict[str, str]],
    enriched_columns: list[dict[str, str]],
    processor: str,
) -> dict[str, Any]:
    """Assemble the enrichment configuration mapping from CLI arguments.

    The returned dict mirrors the YAML config layout consumed by the runner.
    """
    config: dict[str, Any] = {}
    config["source_type"] = source_type
    config["source"] = source
    config["target"] = target
    config["source_columns"] = source_columns
    config["enriched_columns"] = enriched_columns
    config["processor"] = processor
    return config
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def suggest_from_intent(
    intent: str,
    source_columns: list[dict[str, str]] | None = None,
) -> dict[str, Any]:
    """Use Parallel Ingest API to suggest output columns and processor.

    Args:
        intent: Natural-language description of the enrichment task.
        source_columns: Optional input-column specs ({"name", "description"})
            folded into the intent so the API can tailor its suggestion.

    Returns:
        Dict with keys "enriched_columns" (list of column specs),
        "processor", "title", and "warnings".

    Raises:
        httpx.HTTPStatusError: If the primary /tasks/suggest call fails.
    """
    api_key = get_api_key()
    base_url = "https://api.parallel.ai"
    headers = {"x-api-key": api_key, "Content-Type": "application/json"}

    # Append the available input columns to the intent text so the API
    # knows what data is on hand.
    full_intent = intent
    if source_columns:
        col_descriptions = [f"- {col['name']}: {col.get('description', 'no description')}" for col in source_columns]
        full_intent = f"{intent}\n\nInput columns available:\n" + "\n".join(col_descriptions)

    suggest_body: dict[str, Any] = {"user_intent": full_intent}

    with httpx.Client(timeout=60) as client:
        response = client.post(f"{base_url}/v1beta/tasks/suggest", json=suggest_body, headers=headers)
        response.raise_for_status()
        data = response.json()

    output_schema = data.get("output_schema", {})
    properties = output_schema.get("properties", {})

    # Convert each JSON-schema property into the CLI column-spec shape,
    # mapping JSON types to package type names (unknown types default to "str").
    enriched_columns = []
    for name, prop in properties.items():
        col_type = prop.get("type", "string")
        mapped_type = JSON_SCHEMA_TYPE_MAP.get(col_type, "str")
        enriched_columns.append({"name": name, "description": prop.get("description", ""), "type": mapped_type})

    # Best-effort processor recommendation: any failure (network, non-200,
    # malformed payload) falls back to "core-fast" rather than aborting.
    processor = "core-fast"
    try:
        input_schema = data.get("input_schema", {"type": "object", "properties": {}})
        task_spec = {"input_schema": input_schema, "output_schema": output_schema}

        with httpx.Client(timeout=60) as client:
            processor_response = client.post(
                f"{base_url}/v1beta/tasks/suggest-processor", json={"task_spec": task_spec}, headers=headers
            )
            if processor_response.status_code == 200:
                processor_data = processor_response.json()
                recommended = processor_data.get("recommended_processors", [])
                if recommended:
                    processor = recommended[0]
    except Exception:
        pass

    return {
        "enriched_columns": enriched_columns,
        "processor": processor,
        "title": data.get("title", ""),
        "warnings": data.get("warnings", []),
    }
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
# =============================================================================
|
|
164
|
+
# Main CLI Group
|
|
165
|
+
# =============================================================================
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
@click.group()
@click.version_option(version=__version__, prog_name="parallel-cli")
def main():
    """Parallel CLI - AI-powered data enrichment and search."""
    # Group entry point only; subcommands attach via @main.command()/@main.group().
    pass
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
# =============================================================================
|
|
176
|
+
# Auth Commands
|
|
177
|
+
# =============================================================================
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
@main.command()
def auth():
    """Check authentication status."""
    status = get_auth_status()

    if not status["authenticated"]:
        # Unauthenticated: show both supported ways to sign in.
        console.print("[yellow]Not authenticated[/yellow]")
        console.print("\n[cyan]To authenticate:[/cyan]")
        console.print(" Run: parallel-cli login")
        console.print(" Or set PARALLEL_API_KEY environment variable")
        return

    if status["method"] == "environment":
        console.print("[green]Authenticated via PARALLEL_API_KEY environment variable[/green]")
    else:
        console.print("[green]Authenticated via OAuth[/green]")
        console.print(f" Credentials: {status['token_file']}")
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
@main.command()
def login():
    """Authenticate with Parallel API."""
    console.print("[bold cyan]Authenticating with Parallel...[/bold cyan]\n")

    try:
        # force_login=True runs the interactive flow even if a key already exists.
        get_api_key(force_login=True)
        console.print("\n[bold green]Authentication successful![/bold green]")
    except Exception as e:
        console.print(f"[bold red]Authentication failed: {e}[/bold red]")
        raise click.Abort() from None
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
@main.command(name="logout")
|
|
212
|
+
def logout_cmd():
|
|
213
|
+
"""Remove stored credentials."""
|
|
214
|
+
if logout():
|
|
215
|
+
console.print("[green]Logged out successfully[/green]")
|
|
216
|
+
else:
|
|
217
|
+
console.print("[yellow]No stored credentials found[/yellow]")
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
# =============================================================================
|
|
221
|
+
# Search Command
|
|
222
|
+
# =============================================================================
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
@main.command()
@click.argument("objective", required=False)
@click.option("-q", "--query", multiple=True, help="Keyword search query (can be repeated)")
@click.option(
    "--mode", type=click.Choice(["one-shot", "agentic"]), default="one-shot", help="Search mode", show_default=True
)
@click.option("--max-results", type=int, default=10, help="Maximum results", show_default=True)
@click.option("--include-domains", multiple=True, help="Only search these domains")
@click.option("--exclude-domains", multiple=True, help="Exclude these domains")
@click.option("--after-date", help="Only results after this date (YYYY-MM-DD)")
@click.option("--json", "output_json", is_flag=True, help="Output as JSON")
def search(
    objective: str | None,
    query: tuple[str, ...],
    mode: str,
    max_results: int,
    include_domains: tuple[str, ...],
    exclude_domains: tuple[str, ...],
    after_date: str | None,
    output_json: bool,
):
    """Search the web using Parallel's AI-powered search."""
    # At least one of a natural-language objective or a keyword query is required.
    if not objective and not query:
        console.print("[bold red]Error: Provide an objective or at least one --query.[/bold red]")
        raise click.Abort()

    try:
        # Imported lazily so the rest of the CLI works without the SDK installed.
        from parallel import Parallel

        api_key = get_api_key()
        client = Parallel(api_key=api_key)

        search_kwargs: dict[str, Any] = {"mode": mode, "max_results": max_results}
        if objective:
            search_kwargs["objective"] = objective
        if query:
            search_kwargs["search_queries"] = list(query)

        # Optional domain/date restrictions are only sent when provided.
        source_policy: dict[str, Any] = {}
        if include_domains:
            source_policy["include_domains"] = list(include_domains)
        if exclude_domains:
            source_policy["exclude_domains"] = list(exclude_domains)
        if after_date:
            source_policy["after_date"] = after_date
        if source_policy:
            search_kwargs["source_policy"] = source_policy

        if not output_json:
            console.print("[dim]Searching...[/dim]\n")

        result = client.beta.search(**search_kwargs)

        if output_json:
            # Machine-readable mode: plain print to stdout, no rich markup.
            output = {
                "search_id": result.search_id,
                "results": [
                    {"url": r.url, "title": r.title, "publish_date": r.publish_date, "excerpts": r.excerpts}
                    for r in result.results
                ],
                "warnings": result.warnings if hasattr(result, "warnings") else [],
            }
            print(json.dumps(output, indent=2))
        else:
            console.print(f"[bold green]Found {len(result.results)} results[/bold green]\n")
            for i, r in enumerate(result.results, 1):
                console.print(f"[bold cyan]{i}. {r.title}[/bold cyan]")
                console.print(f" [link={r.url}]{r.url}[/link]")
                if r.publish_date:
                    console.print(f" [dim]Published: {r.publish_date}[/dim]")
                if r.excerpts:
                    # Trim the first excerpt to keep terminal output compact.
                    excerpt = r.excerpts[0][:200] + "..." if len(r.excerpts[0]) > 200 else r.excerpts[0]
                    console.print(f" [dim]{excerpt}[/dim]")
                console.print()

    except Exception as e:
        # Top-level CLI boundary: surface the error and exit non-zero.
        console.print(f"[bold red]Error: {e}[/bold red]")
        raise click.Abort() from None
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
# =============================================================================
|
|
306
|
+
# Extract Command
|
|
307
|
+
# =============================================================================
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
@main.command()
@click.argument("urls", nargs=-1, required=True)
@click.option("--objective", help="Focus extraction on a specific goal")
@click.option("-q", "--query", multiple=True, help="Keywords to prioritize (can be repeated)")
@click.option("--full-content", is_flag=True, help="Include complete page content")
@click.option("--no-excerpts", is_flag=True, help="Exclude excerpts from output")
@click.option("--json", "output_json", is_flag=True, help="Output as JSON")
def extract(
    urls: tuple[str, ...],
    objective: str | None,
    query: tuple[str, ...],
    full_content: bool,
    no_excerpts: bool,
    output_json: bool,
):
    """Extract content from URLs as clean markdown."""
    try:
        # Imported lazily so the rest of the CLI works without the SDK installed.
        from parallel import Parallel

        api_key = get_api_key()
        client = Parallel(api_key=api_key)

        extract_kwargs: dict[str, Any] = {
            "urls": list(urls),
            # Opt in to the beta extract API revision.
            "betas": ["search-extract-2025-10-10"],
            "excerpts": not no_excerpts,
            "full_content": full_content,
        }

        if objective:
            extract_kwargs["objective"] = objective
        if query:
            extract_kwargs["search_queries"] = list(query)

        if not output_json:
            console.print(f"[dim]Extracting content from {len(urls)} URL(s)...[/dim]\n")

        result = client.beta.extract(**extract_kwargs)

        if output_json:
            # Machine-readable mode: collect only the fields that are present.
            results_list = []
            for r in result.results:
                result_dict: dict[str, Any] = {"url": r.url, "title": r.title}
                if hasattr(r, "excerpts") and r.excerpts:
                    result_dict["excerpts"] = r.excerpts
                if hasattr(r, "full_content") and r.full_content:
                    result_dict["full_content"] = r.full_content
                results_list.append(result_dict)

            # Per-URL failures are reported alongside successes, not raised.
            errors_list = []
            if hasattr(result, "errors") and result.errors:
                for e in result.errors:
                    errors_list.append(
                        {
                            "url": getattr(e, "url", None),
                            "error": str(getattr(e, "error", "")),
                            "status_code": getattr(e, "status_code", None),
                        }
                    )

            output = {"extract_id": result.extract_id, "results": results_list, "errors": errors_list}
            print(json.dumps(output, indent=2))
        else:
            if result.errors:
                console.print(f"[yellow]Warning: {len(result.errors)} URL(s) failed[/yellow]\n")

            console.print(f"[bold green]Extracted {len(result.results)} page(s)[/bold green]\n")

            for r in result.results:
                console.print(f"[bold cyan]{r.title}[/bold cyan]")
                console.print(f"[link={r.url}]{r.url}[/link]\n")

                if hasattr(r, "excerpts") and r.excerpts:
                    console.print("[dim]Excerpts:[/dim]")
                    # Show at most three excerpts, each trimmed for readability.
                    for excerpt in r.excerpts[:3]:
                        text = excerpt[:300] + "..." if len(excerpt) > 300 else excerpt
                        console.print(f" {text}")
                    console.print()

                if hasattr(r, "full_content") and r.full_content:
                    console.print("[dim]Full content:[/dim]")
                    # Preview only the first 1000 characters in terminal mode.
                    content = r.full_content[:1000] + "..." if len(r.full_content) > 1000 else r.full_content
                    console.print(content)
                    console.print()

    except Exception as e:
        # Top-level CLI boundary: surface the error and exit non-zero.
        console.print(f"[bold red]Error: {e}[/bold red]")
        raise click.Abort() from None
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
# =============================================================================
|
|
401
|
+
# Enrich Command Group
|
|
402
|
+
# =============================================================================
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
@main.group()
def enrich():
    """Data enrichment commands."""
    # Container group for the run/plan/suggest/deploy subcommands below.
    pass
|
|
409
|
+
|
|
410
|
+
|
|
411
|
+
@enrich.command(name="run")
|
|
412
|
+
@click.argument("config_file", required=False)
|
|
413
|
+
@click.option("--source-type", type=click.Choice(["csv", "duckdb", "bigquery"]), help="Data source type")
|
|
414
|
+
@click.option("--source", help="Source file path or table name")
|
|
415
|
+
@click.option("--target", help="Target file path or table name")
|
|
416
|
+
@click.option("--source-columns", help="Source columns as JSON")
|
|
417
|
+
@click.option("--enriched-columns", help="Enriched columns as JSON")
|
|
418
|
+
@click.option("--intent", help="Natural language description (AI suggests columns)")
|
|
419
|
+
@click.option("--processor", type=click.Choice(AVAILABLE_PROCESSORS), help="Processor to use")
|
|
420
|
+
def enrich_run(
|
|
421
|
+
config_file: str | None,
|
|
422
|
+
source_type: str | None,
|
|
423
|
+
source: str | None,
|
|
424
|
+
target: str | None,
|
|
425
|
+
source_columns: str | None,
|
|
426
|
+
enriched_columns: str | None,
|
|
427
|
+
intent: str | None,
|
|
428
|
+
processor: str | None,
|
|
429
|
+
):
|
|
430
|
+
"""Run data enrichment from YAML config or CLI arguments."""
|
|
431
|
+
base_args = [source_type, source, target, source_columns]
|
|
432
|
+
has_cli_args = any(arg is not None for arg in base_args) or enriched_columns or intent
|
|
433
|
+
|
|
434
|
+
if config_file and has_cli_args:
|
|
435
|
+
console.print("[bold red]Error: Provide either a config file OR CLI arguments, not both.[/bold red]")
|
|
436
|
+
raise click.Abort()
|
|
437
|
+
|
|
438
|
+
if not config_file and not has_cli_args:
|
|
439
|
+
console.print("[bold red]Error: Provide a config file or CLI arguments.[/bold red]")
|
|
440
|
+
raise click.Abort()
|
|
441
|
+
|
|
442
|
+
if has_cli_args:
|
|
443
|
+
validate_enrich_args(source_type, source, target, source_columns, enriched_columns, intent)
|
|
444
|
+
|
|
445
|
+
try:
|
|
446
|
+
if config_file:
|
|
447
|
+
console.print(f"[bold cyan]Running enrichment from {config_file}...[/bold cyan]\n")
|
|
448
|
+
run_enrichment(config_file)
|
|
449
|
+
else:
|
|
450
|
+
src_cols = parse_columns(source_columns)
|
|
451
|
+
|
|
452
|
+
if intent:
|
|
453
|
+
console.print("[dim]Getting suggestions from Parallel API...[/dim]")
|
|
454
|
+
suggestion = suggest_from_intent(intent, src_cols)
|
|
455
|
+
enr_cols = suggestion["enriched_columns"]
|
|
456
|
+
final_processor = processor or suggestion["processor"]
|
|
457
|
+
console.print(f"[green]AI suggested {len(enr_cols)} columns, processor: {final_processor}[/green]\n")
|
|
458
|
+
else:
|
|
459
|
+
enr_cols = parse_columns(enriched_columns)
|
|
460
|
+
final_processor = processor or "core-fast"
|
|
461
|
+
|
|
462
|
+
config = build_config_from_args(
|
|
463
|
+
source_type=source_type,
|
|
464
|
+
source=source,
|
|
465
|
+
target=target,
|
|
466
|
+
source_columns=src_cols,
|
|
467
|
+
enriched_columns=enr_cols,
|
|
468
|
+
processor=final_processor,
|
|
469
|
+
)
|
|
470
|
+
|
|
471
|
+
console.print(f"[bold cyan]Running enrichment: {source} -> {target}[/bold cyan]\n")
|
|
472
|
+
run_enrichment_from_dict(config)
|
|
473
|
+
|
|
474
|
+
console.print("\n[bold green]Enrichment complete![/bold green]")
|
|
475
|
+
|
|
476
|
+
except FileNotFoundError as e:
|
|
477
|
+
console.print(f"[bold red]Error: {e}[/bold red]")
|
|
478
|
+
raise click.Abort() from None
|
|
479
|
+
except Exception as e:
|
|
480
|
+
console.print(f"[bold red]Error during enrichment: {e}[/bold red]")
|
|
481
|
+
raise
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
@enrich.command(name="plan")
|
|
485
|
+
@click.option("-o", "--output", default="config.yaml", help="Output YAML file path", show_default=True)
|
|
486
|
+
@click.option("--source-type", type=click.Choice(["csv", "duckdb", "bigquery"]), help="Data source type")
|
|
487
|
+
@click.option("--source", help="Source file path or table name")
|
|
488
|
+
@click.option("--target", help="Target file path or table name")
|
|
489
|
+
@click.option("--source-columns", help="Source columns as JSON")
|
|
490
|
+
@click.option("--enriched-columns", help="Enriched columns as JSON")
|
|
491
|
+
@click.option("--intent", help="Natural language description (AI suggests columns)")
|
|
492
|
+
@click.option("--processor", type=click.Choice(AVAILABLE_PROCESSORS), help="Processor to use")
|
|
493
|
+
def enrich_plan(
|
|
494
|
+
output: str,
|
|
495
|
+
source_type: str | None,
|
|
496
|
+
source: str | None,
|
|
497
|
+
target: str | None,
|
|
498
|
+
source_columns: str | None,
|
|
499
|
+
enriched_columns: str | None,
|
|
500
|
+
intent: str | None,
|
|
501
|
+
processor: str | None,
|
|
502
|
+
):
|
|
503
|
+
"""Create an enrichment configuration file."""
|
|
504
|
+
base_args = [source_type, source, target, source_columns]
|
|
505
|
+
has_cli_args = any(arg is not None for arg in base_args) or enriched_columns or intent
|
|
506
|
+
|
|
507
|
+
if has_cli_args:
|
|
508
|
+
validate_enrich_args(source_type, source, target, source_columns, enriched_columns, intent)
|
|
509
|
+
src_cols = parse_columns(source_columns)
|
|
510
|
+
|
|
511
|
+
if intent:
|
|
512
|
+
console.print("[dim]Getting suggestions from Parallel API...[/dim]")
|
|
513
|
+
suggestion = suggest_from_intent(intent, src_cols)
|
|
514
|
+
enr_cols = suggestion["enriched_columns"]
|
|
515
|
+
final_processor = processor or suggestion["processor"]
|
|
516
|
+
console.print(f"[green]AI suggested {len(enr_cols)} columns, processor: {final_processor}[/green]")
|
|
517
|
+
else:
|
|
518
|
+
enr_cols = parse_columns(enriched_columns)
|
|
519
|
+
final_processor = processor or "core-fast"
|
|
520
|
+
|
|
521
|
+
config = build_config_from_args(
|
|
522
|
+
source_type=source_type,
|
|
523
|
+
source=source,
|
|
524
|
+
target=target,
|
|
525
|
+
source_columns=src_cols,
|
|
526
|
+
enriched_columns=enr_cols,
|
|
527
|
+
processor=final_processor,
|
|
528
|
+
)
|
|
529
|
+
|
|
530
|
+
save_config(config, output)
|
|
531
|
+
console.print(f"[bold green]Configuration saved to {output}[/bold green]")
|
|
532
|
+
else:
|
|
533
|
+
try:
|
|
534
|
+
config = create_config_interactive()
|
|
535
|
+
save_config(config, output)
|
|
536
|
+
except KeyboardInterrupt:
|
|
537
|
+
console.print("\n[yellow]Configuration creation cancelled.[/yellow]")
|
|
538
|
+
raise click.Abort() from None
|
|
539
|
+
|
|
540
|
+
|
|
541
|
+
@enrich.command(name="suggest")
|
|
542
|
+
@click.argument("intent")
|
|
543
|
+
@click.option("--source-columns", help="Source columns as JSON")
|
|
544
|
+
@click.option("--json", "output_json", is_flag=True, help="Output as JSON")
|
|
545
|
+
def enrich_suggest(intent: str, source_columns: str | None, output_json: bool):
|
|
546
|
+
"""Use AI to suggest output columns and processor."""
|
|
547
|
+
try:
|
|
548
|
+
src_cols = parse_columns(source_columns) if source_columns else None
|
|
549
|
+
|
|
550
|
+
if not output_json:
|
|
551
|
+
console.print("[dim]Getting suggestions from Parallel API...[/dim]\n")
|
|
552
|
+
|
|
553
|
+
result = suggest_from_intent(intent, src_cols)
|
|
554
|
+
|
|
555
|
+
if output_json:
|
|
556
|
+
print(json.dumps(result, indent=2))
|
|
557
|
+
else:
|
|
558
|
+
if result.get("title"):
|
|
559
|
+
console.print(f"[bold]Task: {result['title']}[/bold]\n")
|
|
560
|
+
|
|
561
|
+
console.print(f"[bold green]Recommended Processor:[/bold green] {result['processor']}\n")
|
|
562
|
+
|
|
563
|
+
console.print("[bold green]Suggested Output Columns:[/bold green]")
|
|
564
|
+
for col in result["enriched_columns"]:
|
|
565
|
+
console.print(f" [cyan]{col['name']}[/cyan] ({col['type']}): {col['description']}")
|
|
566
|
+
|
|
567
|
+
if result.get("warnings"):
|
|
568
|
+
console.print("\n[yellow]Warnings:[/yellow]")
|
|
569
|
+
for warning in result["warnings"]:
|
|
570
|
+
console.print(f" {warning}")
|
|
571
|
+
|
|
572
|
+
console.print("\n[dim]JSON (for --enriched-columns):[/dim]")
|
|
573
|
+
console.print(json.dumps(result["enriched_columns"]))
|
|
574
|
+
|
|
575
|
+
except Exception as e:
|
|
576
|
+
console.print(f"[bold red]Error: {e}[/bold red]")
|
|
577
|
+
raise click.Abort() from None
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
@enrich.command(name="deploy")
|
|
581
|
+
@click.option("--system", type=click.Choice(["bigquery"]), required=True, help="Target system to deploy to")
|
|
582
|
+
@click.option("--project", "-p", help="Cloud project ID (required for bigquery)")
|
|
583
|
+
@click.option("--region", "-r", default="us-central1", show_default=True, help="Cloud region")
|
|
584
|
+
@click.option("--api-key", "-k", help="Parallel API key (or use PARALLEL_API_KEY env var)")
|
|
585
|
+
@click.option("--dataset", default="parallel_functions", show_default=True, help="Dataset name (BigQuery)")
|
|
586
|
+
def enrich_deploy(system: str, project: str | None, region: str, api_key: str | None, dataset: str):
|
|
587
|
+
"""Deploy Parallel enrichment to a cloud system."""
|
|
588
|
+
if system == "bigquery":
|
|
589
|
+
if not project:
|
|
590
|
+
console.print("[bold red]Error: --project is required for BigQuery deployment.[/bold red]")
|
|
591
|
+
raise click.Abort()
|
|
592
|
+
|
|
593
|
+
from parallel_web_tools.integrations.bigquery import deploy_bigquery_integration
|
|
594
|
+
|
|
595
|
+
if not api_key:
|
|
596
|
+
api_key = os.environ.get("PARALLEL_API_KEY")
|
|
597
|
+
if not api_key:
|
|
598
|
+
try:
|
|
599
|
+
api_key = get_api_key()
|
|
600
|
+
except Exception:
|
|
601
|
+
pass
|
|
602
|
+
if not api_key:
|
|
603
|
+
console.print("[bold red]Error: Parallel API key required[/bold red]")
|
|
604
|
+
console.print(" Use --api-key, PARALLEL_API_KEY env var, or run 'parallel-cli login'")
|
|
605
|
+
raise click.Abort()
|
|
606
|
+
|
|
607
|
+
console.print(f"[bold cyan]Deploying to BigQuery in {project}...[/bold cyan]\n")
|
|
608
|
+
|
|
609
|
+
try:
|
|
610
|
+
result = deploy_bigquery_integration(
|
|
611
|
+
project_id=project,
|
|
612
|
+
api_key=api_key,
|
|
613
|
+
region=region,
|
|
614
|
+
dataset_id=dataset,
|
|
615
|
+
)
|
|
616
|
+
console.print("\n[bold green]Deployment complete![/bold green]")
|
|
617
|
+
console.print(f"\nFunction URL: {result['function_url']}")
|
|
618
|
+
console.print("\n[cyan]Example query:[/cyan]")
|
|
619
|
+
console.print(result["example_query"])
|
|
620
|
+
except Exception as e:
|
|
621
|
+
console.print(f"[bold red]Deployment failed: {e}[/bold red]")
|
|
622
|
+
raise click.Abort() from None
|
|
623
|
+
|
|
624
|
+
|
|
625
|
+
if __name__ == "__main__":
|
|
626
|
+
main()
|