anysite-cli 0.1.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
Potentially problematic release: this version of anysite-cli might be problematic.
- anysite/__init__.py +4 -0
- anysite/__main__.py +6 -0
- anysite/api/__init__.py +21 -0
- anysite/api/client.py +271 -0
- anysite/api/errors.py +137 -0
- anysite/api/schemas.py +333 -0
- anysite/batch/__init__.py +1 -0
- anysite/batch/executor.py +176 -0
- anysite/batch/input.py +160 -0
- anysite/batch/rate_limiter.py +98 -0
- anysite/cli/__init__.py +1 -0
- anysite/cli/config.py +176 -0
- anysite/cli/executor.py +388 -0
- anysite/cli/options.py +249 -0
- anysite/config/__init__.py +11 -0
- anysite/config/paths.py +46 -0
- anysite/config/settings.py +187 -0
- anysite/dataset/__init__.py +37 -0
- anysite/dataset/analyzer.py +268 -0
- anysite/dataset/cli.py +644 -0
- anysite/dataset/collector.py +686 -0
- anysite/dataset/db_loader.py +248 -0
- anysite/dataset/errors.py +30 -0
- anysite/dataset/exporters.py +121 -0
- anysite/dataset/history.py +153 -0
- anysite/dataset/models.py +245 -0
- anysite/dataset/notifications.py +87 -0
- anysite/dataset/scheduler.py +107 -0
- anysite/dataset/storage.py +171 -0
- anysite/dataset/transformer.py +213 -0
- anysite/db/__init__.py +38 -0
- anysite/db/adapters/__init__.py +1 -0
- anysite/db/adapters/base.py +158 -0
- anysite/db/adapters/postgres.py +201 -0
- anysite/db/adapters/sqlite.py +183 -0
- anysite/db/cli.py +687 -0
- anysite/db/config.py +92 -0
- anysite/db/manager.py +166 -0
- anysite/db/operations/__init__.py +1 -0
- anysite/db/operations/insert.py +199 -0
- anysite/db/operations/query.py +43 -0
- anysite/db/schema/__init__.py +1 -0
- anysite/db/schema/inference.py +213 -0
- anysite/db/schema/types.py +71 -0
- anysite/db/utils/__init__.py +1 -0
- anysite/db/utils/sanitize.py +99 -0
- anysite/main.py +498 -0
- anysite/models/__init__.py +1 -0
- anysite/output/__init__.py +11 -0
- anysite/output/console.py +45 -0
- anysite/output/formatters.py +301 -0
- anysite/output/templates.py +76 -0
- anysite/py.typed +0 -0
- anysite/streaming/__init__.py +1 -0
- anysite/streaming/progress.py +121 -0
- anysite/streaming/writer.py +130 -0
- anysite/utils/__init__.py +1 -0
- anysite/utils/fields.py +242 -0
- anysite/utils/retry.py +109 -0
- anysite_cli-0.1.0.dist-info/METADATA +437 -0
- anysite_cli-0.1.0.dist-info/RECORD +64 -0
- anysite_cli-0.1.0.dist-info/WHEEL +4 -0
- anysite_cli-0.1.0.dist-info/entry_points.txt +2 -0
- anysite_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
anysite/api/schemas.py
ADDED
@@ -0,0 +1,333 @@

"""Dynamic OpenAPI schema cache for endpoint discovery.

Fetches the OpenAPI spec from the API, resolves $ref references,
and caches a compact representation of all endpoints with their
input parameters and output fields.

Used by `anysite describe` and `anysite schema update` commands.
"""

from __future__ import annotations

import json
from datetime import UTC, datetime
from pathlib import Path
from typing import Any

import httpx

OPENAPI_URL = "https://api.anysite.io/openapi.json"
CACHE_VERSION = "0.0.1"


def get_cache_path() -> Path:
    """Get the schema cache file path."""
    from anysite.config.paths import get_schema_cache_path

    return get_schema_cache_path()


def resolve_ref(spec: dict[str, Any], ref_path: str) -> dict[str, Any]:
    """Resolve a $ref path within the OpenAPI spec.

    Args:
        spec: Full OpenAPI spec dict.
        ref_path: Reference string like '#/components/schemas/UserProfile'.

    Returns:
        The resolved schema dict.
    """
    parts = ref_path.lstrip("#/").split("/")
    node: Any = spec
    for part in parts:
        node = node[part]
    return node  # type: ignore[no-any-return]


def _resolve_schema(spec: dict[str, Any], schema: dict[str, Any]) -> dict[str, Any]:
    """Recursively resolve all $ref in a schema."""
    if "$ref" in schema:
        resolved = resolve_ref(spec, schema["$ref"])
        return _resolve_schema(spec, resolved)

    if "allOf" in schema:
        merged: dict[str, Any] = {}
        for sub in schema["allOf"]:
            resolved_sub = _resolve_schema(spec, sub)
            merged.update(resolved_sub.get("properties", {}))
        return {"type": "object", "properties": merged}

    if "anyOf" in schema or "oneOf" in schema:
        variants = schema.get("anyOf") or schema.get("oneOf", [])
        # Pick first non-null variant
        for variant in variants:
            resolved_v = _resolve_schema(spec, variant)
            if resolved_v.get("type") != "null":
                return resolved_v
        return variants[0] if variants else schema

    return schema


def _simplify_type(schema: dict[str, Any]) -> str:
    """Convert a resolved schema to a simple type string."""
    if "anyOf" in schema or "oneOf" in schema:
        variants = schema.get("anyOf") or schema.get("oneOf", [])
        types = []
        for v in variants:
            t = v.get("type", "object")
            if t != "null":
                types.append(t)
        return types[0] if types else "object"

    return schema.get("type", "object")


def _extract_properties(spec: dict[str, Any], schema: dict[str, Any]) -> dict[str, str]:
    """Extract flat field -> type mapping from a resolved schema."""
    schema = _resolve_schema(spec, schema)

    # Handle array of objects (most API responses)
    if schema.get("type") == "array" and "items" in schema:
        schema = _resolve_schema(spec, schema["items"])

    properties = schema.get("properties", {})
    result: dict[str, str] = {}
    for field_name, field_schema in properties.items():
        resolved_field = _resolve_schema(spec, field_schema)
        result[field_name] = _simplify_type(resolved_field)
    return result


def _extract_input(spec: dict[str, Any], method_data: dict[str, Any]) -> dict[str, dict[str, Any]]:
    """Extract input parameters from request body schema."""
    request_body = method_data.get("requestBody", {})
    content = request_body.get("content", {})
    json_content = content.get("application/json", {})
    body_schema = json_content.get("schema", {})

    if not body_schema:
        return {}

    resolved = _resolve_schema(spec, body_schema)
    properties = resolved.get("properties", {})
    required_fields = set(resolved.get("required", []))

    result: dict[str, dict[str, Any]] = {}
    for field_name, field_schema in properties.items():
        resolved_field = _resolve_schema(spec, field_schema)
        result[field_name] = {
            "type": _simplify_type(resolved_field),
            "required": field_name in required_fields,
            "description": resolved_field.get("description", ""),
        }
    return result


def extract_endpoint_info(
    spec: dict[str, Any], method_data: dict[str, Any]
) -> dict[str, Any]:
    """Extract input/output info for a single endpoint.

    Args:
        spec: Full OpenAPI spec.
        method_data: The method dict (e.g. spec['paths'][path]['post']).

    Returns:
        Dict with description, tags, input, output.
    """
    description = method_data.get("description", "") or method_data.get("summary", "")
    tags = method_data.get("tags", [])

    # Extract input params
    input_params = _extract_input(spec, method_data)

    # Extract output fields from 200 response
    responses = method_data.get("responses", {})
    success_resp = responses.get("200", responses.get("201", {}))
    resp_content = success_resp.get("content", {})
    resp_json = resp_content.get("application/json", {})
    resp_schema = resp_json.get("schema", {})

    output_fields = _extract_properties(spec, resp_schema) if resp_schema else {}

    return {
        "description": description,
        "tags": tags,
        "input": input_params,
        "output": output_fields,
    }


def fetch_and_parse_openapi(url: str | None = None) -> dict[str, Any]:
    """Fetch the OpenAPI spec and parse all endpoints into a compact cache format.

    Args:
        url: OpenAPI spec URL. Defaults to OPENAPI_URL.

    Returns:
        Cache dict with version, updated_at, and endpoints.
    """
    spec_url = url or OPENAPI_URL
    response = httpx.get(spec_url, timeout=30, follow_redirects=True)
    response.raise_for_status()
    spec = response.json()

    endpoints: dict[str, Any] = {}

    for path, path_data in spec.get("paths", {}).items():
        # Process POST and GET endpoints (POST is the primary API pattern)
        for method in ("post", "get"):
            if method not in path_data:
                continue
            method_data = path_data[method]
            try:
                info = extract_endpoint_info(spec, method_data)
                endpoints[path] = info
            except (KeyError, TypeError):
                # Skip endpoints that can't be parsed
                continue

    return {
        "version": CACHE_VERSION,
        "updated_at": datetime.now(UTC).isoformat(),
        "endpoints": endpoints,
    }


def save_cache(data: dict[str, Any]) -> Path:
    """Save schema cache to disk.

    Returns:
        Path where the cache was saved.
    """
    from anysite.config.paths import ensure_config_dir

    ensure_config_dir()
    cache_path = get_cache_path()
    cache_path.write_text(json.dumps(data, indent=2))
    return cache_path


def load_cache() -> dict[str, Any] | None:
    """Load schema cache from disk.

    Returns:
        Cached data dict, or None if cache doesn't exist.
    """
    cache_path = get_cache_path()
    if not cache_path.exists():
        return None
    try:
        return json.loads(cache_path.read_text())  # type: ignore[no-any-return]
    except (json.JSONDecodeError, OSError):
        return None


def _normalize_command(command: str) -> str:
    """Convert shorthand command to API path.

    Examples:
        'linkedin.user' -> '/api/linkedin/user'
        '/api/linkedin/user' -> '/api/linkedin/user'
        'linkedin.search-users' -> '/api/linkedin/search/users'
        'linkedin.company-employees' -> '/api/linkedin/company/employees'
    """
    if command.startswith("/"):
        return command
    # linkedin.search-users -> /api/linkedin/search/users
    parts = command.replace(".", "/").replace("-", "/")
    return f"/api/{parts}"


def get_schema(command: str) -> dict[str, Any] | None:
    """Get schema info for an endpoint.

    Args:
        command: Command identifier like 'linkedin.user' or '/api/linkedin/user'.

    Returns:
        Dict with description, tags, input, output — or None if not found.
    """
    cache = load_cache()
    if cache is None:
        return None

    endpoints = cache.get("endpoints", {})
    path = _normalize_command(command)

    if path in endpoints:
        return endpoints[path]  # type: ignore[no-any-return]

    # Try fuzzy match: look for path ending
    for ep_path, ep_data in endpoints.items():
        if ep_path.endswith(path.lstrip("/")):
            return ep_data  # type: ignore[no-any-return]

    return None


def search_endpoints(query: str) -> list[dict[str, Any]]:
    """Search endpoints by keyword in path, description, or tags.

    Args:
        query: Search string.

    Returns:
        List of dicts with path, description, tags.
    """
    cache = load_cache()
    if cache is None:
        return []

    query_lower = query.lower()
    results = []

    for path, info in cache.get("endpoints", {}).items():
        description = info.get("description", "")
        tags = " ".join(info.get("tags", []))
        searchable = f"{path} {description} {tags}".lower()

        if query_lower in searchable:
            results.append({
                "path": path,
                "description": description,
                "tags": info.get("tags", []),
            })

    return results


def list_endpoints() -> list[dict[str, Any]]:
    """List all cached endpoints.

    Returns:
        List of dicts with path and description.
    """
    cache = load_cache()
    if cache is None:
        return []

    return [
        {"path": path, "description": info.get("description", "")}
        for path, info in cache.get("endpoints", {}).items()
    ]


def convert_value(value: str, type_hint: str) -> str | int | bool | float:
    """Convert a string value to the type specified in the schema.

    Args:
        value: Raw string value from CLI key=value arg.
        type_hint: Type from schema ('integer', 'boolean', 'number', 'string', etc.)

    Returns:
        Converted value.
    """
    if type_hint == "integer":
        return int(value)
    if type_hint == "boolean":
        return value.lower() in ("true", "1", "yes")
    if type_hint == "number":
        return float(value)
    return value
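For context, a minimal driver sketch of the cache flow above (refresh, persist, look up), assuming the wheel is installed and OPENAPI_URL is reachable; the script itself is not part of the package, and the "limit" parameter probed below is hypothetical:

# Hypothetical driver script, not part of the wheel.
from anysite.api import schemas

cache = schemas.fetch_and_parse_openapi()      # GET openapi.json and compact it
path = schemas.save_cache(cache)               # persist next to other config
print(f"cached {len(cache['endpoints'])} endpoints at {path}")

info = schemas.get_schema("linkedin.user")     # shorthand for /api/linkedin/user
if info:
    for name, meta in info["input"].items():
        flag = "(required)" if meta["required"] else ""
        print(f"  {name}: {meta['type']} {flag}")
    # convert_value coerces raw CLI key=value strings, e.g. "true" -> True
    assert schemas.convert_value("true", "boolean") is True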
anysite/batch/__init__.py
ADDED

@@ -0,0 +1 @@

"""Batch processing modules."""
anysite/batch/executor.py
ADDED

@@ -0,0 +1,176 @@

"""Batch executor for running commands across multiple inputs."""

import asyncio
import time
from collections.abc import Awaitable, Callable
from typing import Any

from anysite.batch.rate_limiter import RateLimiter
from anysite.cli.options import ErrorHandling
from anysite.output.console import print_warning


class BatchResult:
    """Result of a batch execution."""

    def __init__(self) -> None:
        self.results: list[dict[str, Any]] = []
        self.errors: list[dict[str, Any]] = []
        self.total: int = 0
        self.succeeded: int = 0
        self.failed: int = 0
        self.skipped: int = 0
        self.start_time: float = 0
        self.end_time: float = 0

    @property
    def elapsed(self) -> float:
        """Elapsed time in seconds."""
        return self.end_time - self.start_time

    @property
    def rate(self) -> float:
        """Records per second."""
        if self.elapsed > 0:
            return self.succeeded / self.elapsed
        return 0


class BatchExecutor:
    """Execute a command function across multiple inputs.

    Supports sequential and parallel execution with rate limiting,
    delay between requests, and configurable error handling.
    """

    def __init__(
        self,
        func: Callable[[str | dict[str, Any]], Awaitable[Any]],
        parallel: int = 1,
        delay: float = 0.0,
        on_error: ErrorHandling = ErrorHandling.STOP,
        rate_limiter: RateLimiter | None = None,
        progress_callback: Callable[[int], None] | None = None,
    ) -> None:
        """Initialize batch executor.

        Args:
            func: Async function to call for each input
            parallel: Number of concurrent requests
            delay: Delay between sequential requests (seconds)
            on_error: Error handling mode
            rate_limiter: Optional rate limiter
            progress_callback: Optional callback for progress updates
        """
        self.func = func
        self.parallel = max(1, parallel)
        self.delay = delay
        self.on_error = on_error
        self.rate_limiter = rate_limiter
        self.progress_callback = progress_callback

    async def execute(self, inputs: list[str | dict[str, Any]]) -> BatchResult:
        """Execute the function for all inputs.

        Args:
            inputs: List of inputs to process

        Returns:
            BatchResult with results and statistics
        """
        result = BatchResult()
        result.total = len(inputs)
        result.start_time = time.monotonic()

        if self.parallel > 1:
            await self._execute_parallel(inputs, result)
        else:
            await self._execute_sequential(inputs, result)

        result.end_time = time.monotonic()
        return result

    async def _execute_sequential(
        self,
        inputs: list[str | dict[str, Any]],
        result: BatchResult,
    ) -> None:
        """Execute inputs one at a time."""
        for i, inp in enumerate(inputs):
            try:
                if self.rate_limiter:
                    await self.rate_limiter.acquire()

                data = await self.func(inp)
                result.results.append(data if isinstance(data, dict) else {"data": data})
                result.succeeded += 1

                if self.progress_callback:
                    self.progress_callback(1)

                if self.delay > 0 and i < len(inputs) - 1:
                    await asyncio.sleep(self.delay)

            except Exception as e:
                result.failed += 1
                error_info = {"input": str(inp), "error": str(e)}
                result.errors.append(error_info)

                if self.on_error == ErrorHandling.STOP:
                    raise
                elif self.on_error == ErrorHandling.SKIP:
                    print_warning(f"Skipping '{inp}': {e}")
                    result.skipped += 1
                if self.progress_callback:
                    self.progress_callback(1)

    async def _execute_parallel(
        self,
        inputs: list[str | dict[str, Any]],
        result: BatchResult,
    ) -> None:
        """Execute inputs with limited concurrency."""
        semaphore = asyncio.Semaphore(self.parallel)

        async def _process(inp: str | dict[str, Any]) -> None:
            async with semaphore:
                try:
                    if self.rate_limiter:
                        await self.rate_limiter.acquire()

                    data = await self.func(inp)
                    result.results.append(
                        data if isinstance(data, dict) else {"data": data}
                    )
                    result.succeeded += 1

                    if self.progress_callback:
                        self.progress_callback(1)

                except Exception as e:
                    result.failed += 1
                    error_info = {"input": str(inp), "error": str(e)}
                    result.errors.append(error_info)

                    if self.on_error == ErrorHandling.STOP:
                        raise
                    elif self.on_error == ErrorHandling.SKIP:
                        print_warning(f"Skipping '{inp}': {e}")
                        result.skipped += 1
                    if self.progress_callback:
                        self.progress_callback(1)

        tasks = [asyncio.create_task(_process(inp)) for inp in inputs]

        if self.on_error == ErrorHandling.STOP:
            # If any task fails, cancel the rest
            try:
                await asyncio.gather(*tasks)
            except Exception:
                for task in tasks:
                    if not task.done():
                        task.cancel()
                raise
        else:
            # Let all tasks complete even if some fail
            await asyncio.gather(*tasks, return_exceptions=True)
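A minimal sketch of driving BatchExecutor, assuming the wheel is installed; fetch_one is a hypothetical stand-in for a real API call, and ErrorHandling is the enum the module imports above:

# Hypothetical usage, not part of the wheel.
import asyncio

from anysite.batch.executor import BatchExecutor
from anysite.cli.options import ErrorHandling

async def fetch_one(inp):
    # Stand-in for a real API call
    await asyncio.sleep(0.1)
    return {"input": inp, "ok": True}

async def main() -> None:
    executor = BatchExecutor(
        func=fetch_one,
        parallel=4,                     # up to 4 concurrent tasks
        on_error=ErrorHandling.SKIP,    # warn and continue on failures
        progress_callback=lambda n: print(".", end="", flush=True),
    )
    result = await executor.execute(["alice", "bob", "carol"])
    print(f"\n{result.succeeded}/{result.total} ok at {result.rate:.1f} rec/s")

asyncio.run(main())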
anysite/batch/input.py
ADDED
@@ -0,0 +1,160 @@

"""Batch input parser for reading inputs from files and stdin."""

import csv
import io
import sys
from enum import Enum
from pathlib import Path
from typing import Any

import orjson


class InputFormat(str, Enum):
    """Supported input file formats."""

    TEXT = "text"
    JSONL = "jsonl"
    CSV = "csv"


class InputParser:
    """Parse batch inputs from various sources."""

    @staticmethod
    def from_file(path: Path) -> "list[str | dict[str, Any]]":
        """Load inputs from a file, auto-detecting format.

        Args:
            path: Path to input file

        Returns:
            List of inputs (strings for text, dicts for JSONL/CSV)

        Raises:
            FileNotFoundError: If file doesn't exist
            ValueError: If file is empty or format is unsupported
        """
        if not path.exists():
            raise FileNotFoundError(f"Input file not found: {path}")

        content = path.read_text(encoding="utf-8").strip()
        if not content:
            raise ValueError(f"Input file is empty: {path}")

        fmt = InputParser.detect_format(path)

        results: list[str | dict[str, Any]]
        if fmt == InputFormat.JSONL:
            results = InputParser.parse_jsonl(content)  # type: ignore[assignment]
        elif fmt == InputFormat.CSV:
            results = InputParser.parse_csv(content)  # type: ignore[assignment]
        else:
            results = InputParser.parse_text(content)  # type: ignore[assignment]
        return results

    @staticmethod
    def from_stdin() -> "list[str | dict[str, Any]]":
        """Read inputs from stdin.

        Returns:
            List of inputs (strings, one per line)

        Raises:
            ValueError: If stdin is empty or is a TTY
        """
        if sys.stdin.isatty():
            raise ValueError(
                "No input detected on stdin. "
                "Pipe data or use --from-file instead."
            )

        content = sys.stdin.read().strip()
        if not content:
            raise ValueError("No input received from stdin.")

        # Try to detect format
        first_line = content.split("\n")[0].strip()
        if first_line.startswith("{"):
            return InputParser.parse_jsonl(content)  # type: ignore[return-value]
        return InputParser.parse_text(content)  # type: ignore[return-value]

    @staticmethod
    def detect_format(path: Path) -> InputFormat:
        """Detect input file format from extension.

        Args:
            path: Path to input file

        Returns:
            Detected InputFormat
        """
        suffix = path.suffix.lower()

        if suffix in (".jsonl", ".ndjson"):
            return InputFormat.JSONL
        elif suffix == ".csv":
            return InputFormat.CSV
        elif suffix in (".json",):
            # Check if it's actually JSONL (one JSON per line)
            content = path.read_text(encoding="utf-8").strip()
            lines = content.split("\n")
            if len(lines) > 1 and lines[0].strip().startswith("{"):
                return InputFormat.JSONL
            return InputFormat.TEXT
        else:
            return InputFormat.TEXT

    @staticmethod
    def parse_text(content: str) -> list[str]:
        """Parse plain text input (one value per line).

        Args:
            content: Raw text content

        Returns:
            List of non-empty stripped lines
        """
        lines = content.strip().split("\n")
        return [line.strip() for line in lines if line.strip()]

    @staticmethod
    def parse_jsonl(content: str) -> list[dict[str, Any]]:
        """Parse JSON Lines input.

        Args:
            content: JSONL content

        Returns:
            List of parsed dictionaries

        Raises:
            ValueError: If a line contains invalid JSON
        """
        results: list[dict[str, Any]] = []
        for i, line in enumerate(content.strip().split("\n"), 1):
            line = line.strip()
            if not line:
                continue
            try:
                parsed = orjson.loads(line)
                if isinstance(parsed, dict):
                    results.append(parsed)
                else:
                    results.append({"value": parsed})
            except orjson.JSONDecodeError as e:
                raise ValueError(f"Invalid JSON on line {i}: {e}") from e
        return results

    @staticmethod
    def parse_csv(content: str) -> list[dict[str, Any]]:
        """Parse CSV input with headers.

        Args:
            content: CSV content with header row

        Returns:
            List of dictionaries (one per row)
        """
        reader = csv.DictReader(io.StringIO(content))
        return [dict(row) for row in reader]