parallel-web-tools 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. parallel_web_tools/__init__.py +56 -0
  2. parallel_web_tools/cli/__init__.py +5 -0
  3. parallel_web_tools/cli/commands.py +626 -0
  4. parallel_web_tools/cli/planner.py +438 -0
  5. parallel_web_tools/core/__init__.py +70 -0
  6. parallel_web_tools/core/auth.py +256 -0
  7. parallel_web_tools/core/batch.py +307 -0
  8. parallel_web_tools/core/result.py +29 -0
  9. parallel_web_tools/core/runner.py +75 -0
  10. parallel_web_tools/core/schema.py +169 -0
  11. parallel_web_tools/integrations/__init__.py +17 -0
  12. parallel_web_tools/integrations/bigquery/__init__.py +34 -0
  13. parallel_web_tools/integrations/bigquery/cloud_function/main.py +199 -0
  14. parallel_web_tools/integrations/bigquery/cloud_function/requirements.txt +5 -0
  15. parallel_web_tools/integrations/bigquery/deploy.py +456 -0
  16. parallel_web_tools/integrations/bigquery/sql/create_functions.sql +49 -0
  17. parallel_web_tools/integrations/duckdb/__init__.py +63 -0
  18. parallel_web_tools/integrations/duckdb/batch.py +220 -0
  19. parallel_web_tools/integrations/duckdb/udf.py +159 -0
  20. parallel_web_tools/integrations/polars/__init__.py +37 -0
  21. parallel_web_tools/integrations/polars/enrich.py +218 -0
  22. parallel_web_tools/integrations/snowflake/__init__.py +46 -0
  23. parallel_web_tools/integrations/snowflake/deploy.py +347 -0
  24. parallel_web_tools/integrations/snowflake/sql/01_setup.sql +97 -0
  25. parallel_web_tools/integrations/snowflake/sql/02_create_udf.sql +260 -0
  26. parallel_web_tools/integrations/snowflake/sql/03_cleanup.sql +61 -0
  27. parallel_web_tools/integrations/spark/__init__.py +57 -0
  28. parallel_web_tools/integrations/spark/streaming.py +404 -0
  29. parallel_web_tools/integrations/spark/udf.py +206 -0
  30. parallel_web_tools/integrations/utils.py +32 -0
  31. parallel_web_tools/processors/__init__.py +20 -0
  32. parallel_web_tools/processors/bigquery.py +70 -0
  33. parallel_web_tools/processors/csv.py +32 -0
  34. parallel_web_tools/processors/duckdb.py +25 -0
  35. parallel_web_tools-0.0.1.dist-info/METADATA +346 -0
  36. parallel_web_tools-0.0.1.dist-info/RECORD +38 -0
  37. parallel_web_tools-0.0.1.dist-info/WHEEL +4 -0
  38. parallel_web_tools-0.0.1.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,438 @@
1
+ """Interactive planner for creating YAML configuration files."""
2
+
3
+ import csv
4
+ import os
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ import duckdb
9
+ import questionary
10
+ import yaml
11
+ from questionary import Style
12
+ from rich.console import Console
13
+ from rich.panel import Panel
14
+ from rich.table import Table
15
+ from rich.text import Text
16
+
17
+ from parallel_web_tools.core import get_client
18
+ from parallel_web_tools.core.schema import JSON_SCHEMA_TYPE_MAP, get_available_types
19
+
20
# Theme for the questionary prompts in this module: each entry maps a
# prompt token class to its style string. The mapping is insertion-ordered,
# so the rule list handed to Style keeps the order written here.
custom_style = Style(
    list(
        {
            "qmark": "fg:#673ab7 bold",
            "question": "bold",
            "answer": "fg:#f44336 bold",
            "pointer": "fg:#673ab7 bold",
            "highlighted": "fg:#673ab7 bold",
            "selected": "fg:#cc5454",
            "separator": "fg:#cc5454",
            "instruction": "",
            "text": "",
            "disabled": "fg:#858585 italic",
        }.items()
    )
)

# Shared rich console used for all planner output.
console = Console()
37
+
38
+
39
def get_available_processors() -> list[str]:
    """Return the processor names defined by the core schema module.

    The schema module is imported at call time, and its
    ``AVAILABLE_PROCESSORS`` object is returned as-is (not copied).
    """
    import parallel_web_tools.core.schema as schema_module

    return schema_module.AVAILABLE_PROCESSORS
44
+
45
+
46
def suggest_output_columns(source_columns: list[dict[str, str]], user_intent: str) -> list[dict[str, str]] | None:
    """Ask the Parallel API to propose output columns for an enrichment.

    Args:
        source_columns: Input column dicts; each must have a ``name`` and may
            have a ``description``.
        user_intent: Free-text description of what the user wants to enrich.

    Returns:
        A list of ``{"name", "description", "type"}`` dicts built from the
        ``output_schema`` properties in the response, or ``None`` if anything
        fails (a warning is printed in that case).
    """
    try:
        api = get_client()

        # Every source column is presented to the API as a required string input.
        described_inputs = {
            column["name"]: {
                "type": "string",
                "description": column.get("description", ""),
            }
            for column in source_columns
        }

        payload = {
            "user_intent": user_intent,
            "previous_task": {
                "input_schema": {
                    "type": "object",
                    "properties": described_inputs,
                    "required": list(described_inputs.keys()),
                }
            },
        }
        reply = api.post(path="/v1beta/tasks/suggest", body=payload, cast_to=dict)

        suggested_properties = reply.get("output_schema", {}).get("properties", {})

        # JSON-schema type names are translated through JSON_SCHEMA_TYPE_MAP;
        # anything missing from the map falls back to "str".
        return [
            {
                "name": field_name,
                "description": spec.get("description", ""),
                "type": JSON_SCHEMA_TYPE_MAP.get(spec.get("type", "string"), "str"),
            }
            for field_name, spec in suggested_properties.items()
        ]

    except Exception as e:
        console.print(f"[yellow]Warning: Could not get suggestions: {e}[/yellow]")
        return None
92
+
93
+
94
def print_header():
    """Print the planner title inside a magenta panel, followed by a blank line."""
    title = Text().append("Parallel Data Enrichment Planner", style="bold magenta")
    banner = Panel(title, border_style="magenta")
    console.print(banner)
    console.print()
100
+
101
+
102
def get_source_type() -> str:
    """Ask which kind of data source to use and return the prompt's answer.

    The offered values are ``"csv"``, ``"duckdb"`` and ``"bigquery"``.
    """
    # (label shown to the user, value returned by the prompt)
    labelled_sources = (
        ("CSV File", "csv"),
        ("DuckDB Database", "duckdb"),
        ("Google BigQuery", "bigquery"),
    )
    prompt = questionary.select(
        "What type of data source are you using?",
        choices=[questionary.Choice(label, value=value) for label, value in labelled_sources],
        style=custom_style,
    )
    return prompt.ask()
113
+
114
+
115
def get_csv_columns(file_path: str) -> list[str]:
    """Return the header column names of a CSV file.

    Args:
        file_path: Path of the CSV file to inspect.

    Returns:
        The field names from the first row, or an empty list when the file is
        empty or cannot be read (a warning is printed in the latter case).
    """
    try:
        # newline="" hands newline handling to the csv module, so header
        # fields that contain embedded line breaks are read back unmodified.
        with open(file_path, newline="") as f:
            header = csv.DictReader(f).fieldnames
        return list(header or [])
    except Exception as e:
        # Best effort: detection failure only means the user types columns by hand.
        console.print(f"[yellow]Warning: Could not read CSV file: {e}[/yellow]")
        return []
124
+
125
+
126
def get_duckdb_columns(db_path: str, table_name: str) -> list[str]:
    """Return the column names of a DuckDB table.

    Args:
        db_path: Path passed to ``duckdb.connect``.
        table_name: Table whose columns are looked up in
            ``information_schema.columns``.

    Returns:
        The matching column names, or an empty list if the lookup fails
        (a warning is printed in that case).
    """
    try:
        with duckdb.connect(db_path) as con:
            # The table name comes straight from a prompt, so bind it as a
            # parameter instead of interpolating it into the SQL text.
            rows = con.execute(
                "SELECT column_name FROM information_schema.columns WHERE table_name = ?",
                [table_name],
            ).fetchall()
        return [row[0] for row in rows]
    except Exception as e:
        # Best effort: detection failure only means the user types columns by hand.
        console.print(f"[yellow]Warning: Could not read DuckDB table: {e}[/yellow]")
        return []
137
+
138
+
139
def get_bigquery_columns(project: str, table_name: str) -> list[str]:
    """Return the column names of a BigQuery table.

    Args:
        project: Google Cloud project ID used to build the ``bigquery://`` URL.
        table_name: Table reference with at least two dot-separated parts;
            the last two parts are used as dataset and table.

    Returns:
        The column names, or an empty list when the table name has no dataset
        qualifier or the lookup fails. A warning is printed in both cases, so
        the caller always receives a list.
    """
    try:
        from sqlalchemy import create_engine, inspect

        parts = table_name.split(".")
        if len(parts) < 2:
            # Without a dataset there is nothing to inspect; say so instead of
            # silently detecting no columns.
            console.print(
                "[yellow]Warning: Could not read BigQuery table: "
                "expected dataset.table or project.dataset.table[/yellow]"
            )
            return []

        schema, table = parts[-2], parts[-1]
        engine = create_engine(f"bigquery://{project}")
        columns = inspect(engine).get_columns(table, schema=schema)
        return [col["name"] for col in columns]
    except Exception as e:
        # Best effort: detection failure only means the user types columns by hand.
        console.print(f"[yellow]Warning: Could not read BigQuery table: {e}[/yellow]")
        return []
156
+
157
+
158
def prompt_for_columns(
    column_names: list[str],
    prompt_text: str,
    allow_new: bool = True,
    prompt_for_type: bool = False,
) -> list[dict[str, str]]:
    """Collect column definitions from the user.

    Args:
        column_names: Existing column names offered in a checkbox; when empty
            the checkbox (and ``prompt_text``) is skipped.
        prompt_text: Heading printed above the checkbox.
        allow_new: Whether the user may type in additional columns.
        prompt_for_type: Whether to also ask for a type for each column.

    Returns:
        One dict per column with ``name`` and ``description``, plus ``type``
        when ``prompt_for_type`` is set.
    """
    columns = []
    type_choices = get_available_types()

    def describe(name):
        # Ask for the description (and optionally the type) of one column.
        entry = {
            "name": name,
            "description": questionary.text(f"Description for '{name}':", style=custom_style).ask(),
        }
        if prompt_for_type:
            entry["type"] = questionary.select(
                f"Type for '{name}':",
                choices=type_choices,
                default="str",
                style=custom_style,
            ).ask()
        return entry

    picked = []
    if column_names:
        console.print(f"\n[bold cyan]{prompt_text}[/bold cyan]")
        answer = questionary.checkbox(
            "Select columns (use space to select, enter to confirm):",
            choices=column_names,
            style=custom_style,
        ).ask()
        # A falsy answer (nothing ticked) is treated as an empty selection.
        picked = answer or []

    for existing_name in picked:
        columns.append(describe(existing_name))

    if not allow_new:
        return columns

    while True:
        # The "add another" question is only asked once at least one column
        # exists; with none yet, go straight to asking for a name.
        if columns:
            wants_more = questionary.confirm("Add another column?", default=False, style=custom_style).ask()
            if not wants_more:
                break

        new_name = questionary.text("Column name:", style=custom_style).ask()
        columns.append(describe(new_name))

    return columns
219
+
220
+
221
def display_summary(config: dict[str, Any]):
    """Print the configuration as an overview table plus column tables.

    Args:
        config: Must contain ``source_type``, ``source``, ``target``,
            ``source_columns`` and ``enriched_columns``; ``processor`` is
            optional and shown as ``core-fast`` when absent.
    """
    console.print("\n[bold green]Configuration Summary[/bold green]\n")

    overview = Table(show_header=False, box=None, padding=(0, 2))
    overview.add_column(style="cyan bold")
    overview.add_column()

    overview_rows = (
        ("Source Type:", config["source_type"].upper()),
        ("Source:", config["source"]),
        ("Target:", config["target"]),
        ("Processor:", config.get("processor", "core-fast")),
    )
    for label, value in overview_rows:
        overview.add_row(label, value)

    console.print(overview)
    console.print()

    source_columns = config["source_columns"]
    if source_columns:
        console.print("[bold]Source Columns:[/bold]")
        source_listing = Table(show_header=True, header_style="bold magenta")
        source_listing.add_column("Column Name", style="cyan")
        source_listing.add_column("Description")

        for column in source_columns:
            source_listing.add_row(column["name"], column["description"])

        console.print(source_listing)
        console.print()

    enriched_columns = config["enriched_columns"]
    if enriched_columns:
        console.print("[bold]Enriched Columns:[/bold]")
        enriched_listing = Table(show_header=True, header_style="bold green")
        enriched_listing.add_column("Column Name", style="green")
        enriched_listing.add_column("Type", style="yellow")
        enriched_listing.add_column("Description")

        for column in enriched_columns:
            # Columns without an explicit type are displayed as "str".
            enriched_listing.add_row(column["name"], column.get("type", "str"), column["description"])

        console.print(enriched_listing)
        console.print()
262
+
263
+
264
def create_config_interactive() -> dict[str, Any]:
    """Walk the user through building an enrichment configuration.

    Prompts, in order, for the source type and its location, the source
    columns to use as enrichment input, the enriched columns to produce
    (optionally seeded by API suggestions), and the processor. A summary is
    printed before returning.

    Returns:
        A dict with the keys ``source_type``, ``source``, ``target``,
        ``source_columns``, ``enriched_columns`` and ``processor``.

    Raises:
        NotImplementedError: If the selected source type is not one of
            ``csv``, ``duckdb`` or ``bigquery``.
    """
    print_header()

    config: dict[str, Any] = {}

    source_type = get_source_type()
    config["source_type"] = source_type

    if source_type == "csv":
        source_path = questionary.path("Path to source CSV file:", style=custom_style).ask()
        config["source"] = source_path
        # Build the suggested target by inserting "_enriched" before the file
        # extension. A plain str.replace(".csv", ...) rewrites ".csv" anywhere
        # in the path and, for a source without a ".csv" suffix, suggests the
        # source file itself as the target. `or ""` keeps a missing answer
        # from raising here.
        stem, extension = os.path.splitext(source_path or "")
        default_target = f"{stem}_enriched{extension or '.csv'}" if stem else ""
        target_path = questionary.text(
            "Path to target CSV file:",
            default=default_target,
            style=custom_style,
        ).ask()
        config["target"] = target_path
        detected_columns = get_csv_columns(source_path)

    elif source_type == "duckdb":
        # NOTE(review): db_path is only used for column detection below and is
        # not stored in the config - confirm the runner locates the database
        # some other way.
        db_path = questionary.text(
            "Path to DuckDB database:",
            default=os.getenv("DUCKDB_FILE", "data/file.db"),
            style=custom_style,
        ).ask()
        source_table = questionary.text("Source table name:", style=custom_style).ask()
        config["source"] = source_table
        target_table = questionary.text(
            "Target table name:",
            default=f"{source_table}_enriched",
            style=custom_style,
        ).ask()
        config["target"] = target_table
        detected_columns = get_duckdb_columns(db_path, source_table)

    elif source_type == "bigquery":
        # NOTE(review): the project ID is likewise only used for column
        # detection and is not stored in the config.
        project = questionary.text(
            "Google Cloud Project ID:",
            default=os.getenv("BIGQUERY_PROJECT", ""),
            style=custom_style,
        ).ask()
        source_table = questionary.text(
            "Source table (format: dataset.table or project.dataset.table):",
            style=custom_style,
        ).ask()
        config["source"] = source_table
        target_table = questionary.text(
            "Target table:",
            default=f"{source_table}_enriched",
            style=custom_style,
        ).ask()
        config["target"] = target_table
        detected_columns = get_bigquery_columns(project, source_table)
    else:
        raise NotImplementedError(f"{source_type} not a supported source type")

    if detected_columns:
        console.print(f"\n[bold green]Detected {len(detected_columns)} columns[/bold green]")
        # Only the first ten names are listed to keep the output short.
        console.print(", ".join(detected_columns[:10]))
        if len(detected_columns) > 10:
            console.print(f"... and {len(detected_columns) - 10} more")

    console.print()
    source_columns = []
    # Keep asking until at least one source column has been chosen or typed.
    while not source_columns:
        source_columns = prompt_for_columns(
            detected_columns,
            "Select source columns to use as input for enrichment:",
            allow_new=True,
        )
        if not source_columns:
            console.print("[bold red]Error: At least one source column is required![/bold red]\n")

    config["source_columns"] = source_columns

    console.print()
    console.print("[bold yellow]Now define the new columns you want to enrich your data with.[/bold yellow]")

    use_suggestions = questionary.confirm(
        "Would you like AI to suggest output columns based on your intent?",
        default=True,
        style=custom_style,
    ).ask()

    enriched_columns = []

    if use_suggestions:
        user_intent = questionary.text(
            "Describe what you want to enrich (e.g., 'Find the CEO and company valuation'):",
            style=custom_style,
        ).ask()

        if user_intent:
            console.print("\n[dim]Getting suggestions from Parallel API...[/dim]")
            suggested = suggest_output_columns(source_columns, user_intent)

            if suggested:
                console.print(f"\n[bold green]AI suggested {len(suggested)} output columns:[/bold green]\n")

                suggest_table = Table(show_header=True, header_style="bold green")
                suggest_table.add_column("Column Name", style="green")
                suggest_table.add_column("Type", style="yellow")
                suggest_table.add_column("Description")

                for col in suggested:
                    suggest_table.add_row(col["name"], col.get("type", "str"), col["description"])

                console.print(suggest_table)
                console.print()

                accept_suggestions = questionary.select(
                    "How would you like to proceed?",
                    choices=[
                        questionary.Choice("Accept all suggestions", value="accept"),
                        questionary.Choice("Select which to keep", value="select"),
                        questionary.Choice("Start fresh (ignore suggestions)", value="ignore"),
                    ],
                    style=custom_style,
                ).ask()

                if accept_suggestions == "accept":
                    # Copy so that extending enriched_columns later does not
                    # also grow the suggestions list.
                    enriched_columns = list(suggested)
                elif accept_suggestions == "select":
                    # `or []` keeps a missing answer from breaking the
                    # membership test; it then counts as "keep nothing".
                    selected_names = (
                        questionary.checkbox(
                            "Select columns to keep:",
                            choices=[col["name"] for col in suggested],
                            style=custom_style,
                        ).ask()
                        or []
                    )
                    enriched_columns = [col for col in suggested if col["name"] in selected_names]

    if not enriched_columns:
        console.print("[dim](At least one enriched column is required)[/dim]\n")

    # Fall back to manual entry until at least one enriched column exists.
    while not enriched_columns:
        enriched_columns = prompt_for_columns([], "Define enriched columns:", allow_new=True, prompt_for_type=True)
        if not enriched_columns:
            console.print("[bold red]Error: At least one enriched column is required![/bold red]\n")

    if enriched_columns and use_suggestions:
        add_more = questionary.confirm("Add additional columns?", default=False, style=custom_style).ask()
        if add_more:
            additional = prompt_for_columns([], "Add more columns:", allow_new=True, prompt_for_type=True)
            enriched_columns.extend(additional)

    config["enriched_columns"] = enriched_columns

    console.print()
    console.print("[bold cyan]Select the Parallel API processor to use:[/bold cyan]")
    console.print("[dim](See https://parallel.ai/pricing for details)[/dim]\n")

    processor_choices = get_available_processors()
    processor = questionary.select(
        "Which processor would you like to use?",
        choices=processor_choices,
        default="core-fast",
        style=custom_style,
    ).ask()
    config["processor"] = processor

    display_summary(config)
    return config
426
+
427
+
428
def save_config(config: dict[str, Any], output_path: str):
    """Write the configuration to a YAML file and print how to run it.

    Missing parent directories of ``output_path`` are created first. The YAML
    is written in block style with keys in insertion order.

    Args:
        config: Configuration mapping to serialise.
        output_path: Destination file path.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    with destination.open("w") as stream:
        yaml.dump(config, stream, default_flow_style=False, sort_keys=False)

    for message in (
        f"\n[bold green]Configuration saved to {output_path}[/bold green]",
        "\n[cyan]Run your enrichment with:[/cyan]",
        f"[bold] parallel-cli enrich run {output_path}[/bold]\n",
    ):
        console.print(message)
@@ -0,0 +1,70 @@
1
+ """Core functionality for Parallel Data."""
2
+
3
+ from parallel_web_tools.core.auth import (
4
+ get_api_key,
5
+ get_async_client,
6
+ get_auth_status,
7
+ get_client,
8
+ logout,
9
+ resolve_api_key,
10
+ )
11
+ from parallel_web_tools.core.batch import (
12
+ build_output_schema,
13
+ enrich_batch,
14
+ enrich_single,
15
+ extract_basis,
16
+ run_tasks,
17
+ )
18
+ from parallel_web_tools.core.result import EnrichmentResult
19
+ from parallel_web_tools.core.runner import (
20
+ run_enrichment,
21
+ run_enrichment_from_dict,
22
+ )
23
+ from parallel_web_tools.core.schema import (
24
+ AVAILABLE_PROCESSORS,
25
+ JSON_SCHEMA_TYPE_MAP,
26
+ TYPE_MAP,
27
+ Column,
28
+ InputSchema,
29
+ ParseError,
30
+ ProcessorType,
31
+ SourceType,
32
+ get_available_types,
33
+ load_schema,
34
+ parse_input_and_output_models,
35
+ parse_schema,
36
+ )
37
+
38
+ __all__ = [
39
+ # Auth (re-exported from parallel_web_tools.core.auth)
40
+ "get_api_key",
41
+ "get_auth_status",
42
+ "get_client",
43
+ "get_async_client",
44
+ "logout",
45
+ "resolve_api_key",
46
+ # Schema (re-exported from parallel_web_tools.core.schema)
47
+ "AVAILABLE_PROCESSORS",
48
+ "Column",
49
+ "InputSchema",
50
+ "JSON_SCHEMA_TYPE_MAP",
51
+ "ParseError",
52
+ "ProcessorType",
53
+ "SourceType",
54
+ "TYPE_MAP",
55
+ "get_available_types",
56
+ "load_schema",
57
+ "parse_schema",
58
+ "parse_input_and_output_models",
59
+ # Batch (re-exported from parallel_web_tools.core.batch)
60
+ "build_output_schema",
61
+ "enrich_batch",
62
+ "enrich_single",
63
+ "extract_basis",
64
+ "run_tasks",
65
+ # Runner (re-exported from parallel_web_tools.core.runner)
66
+ "run_enrichment",
67
+ "run_enrichment_from_dict",
68
+ # Result (re-exported from parallel_web_tools.core.result)
69
+ "EnrichmentResult",
70
+ ]