anysite-cli 0.1.0 (anysite_cli-0.1.0-py3-none-any.whl)

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.

Potentially problematic release.

This version of anysite-cli might be problematic.

Files changed (64)
  1. anysite/__init__.py +4 -0
  2. anysite/__main__.py +6 -0
  3. anysite/api/__init__.py +21 -0
  4. anysite/api/client.py +271 -0
  5. anysite/api/errors.py +137 -0
  6. anysite/api/schemas.py +333 -0
  7. anysite/batch/__init__.py +1 -0
  8. anysite/batch/executor.py +176 -0
  9. anysite/batch/input.py +160 -0
  10. anysite/batch/rate_limiter.py +98 -0
  11. anysite/cli/__init__.py +1 -0
  12. anysite/cli/config.py +176 -0
  13. anysite/cli/executor.py +388 -0
  14. anysite/cli/options.py +249 -0
  15. anysite/config/__init__.py +11 -0
  16. anysite/config/paths.py +46 -0
  17. anysite/config/settings.py +187 -0
  18. anysite/dataset/__init__.py +37 -0
  19. anysite/dataset/analyzer.py +268 -0
  20. anysite/dataset/cli.py +644 -0
  21. anysite/dataset/collector.py +686 -0
  22. anysite/dataset/db_loader.py +248 -0
  23. anysite/dataset/errors.py +30 -0
  24. anysite/dataset/exporters.py +121 -0
  25. anysite/dataset/history.py +153 -0
  26. anysite/dataset/models.py +245 -0
  27. anysite/dataset/notifications.py +87 -0
  28. anysite/dataset/scheduler.py +107 -0
  29. anysite/dataset/storage.py +171 -0
  30. anysite/dataset/transformer.py +213 -0
  31. anysite/db/__init__.py +38 -0
  32. anysite/db/adapters/__init__.py +1 -0
  33. anysite/db/adapters/base.py +158 -0
  34. anysite/db/adapters/postgres.py +201 -0
  35. anysite/db/adapters/sqlite.py +183 -0
  36. anysite/db/cli.py +687 -0
  37. anysite/db/config.py +92 -0
  38. anysite/db/manager.py +166 -0
  39. anysite/db/operations/__init__.py +1 -0
  40. anysite/db/operations/insert.py +199 -0
  41. anysite/db/operations/query.py +43 -0
  42. anysite/db/schema/__init__.py +1 -0
  43. anysite/db/schema/inference.py +213 -0
  44. anysite/db/schema/types.py +71 -0
  45. anysite/db/utils/__init__.py +1 -0
  46. anysite/db/utils/sanitize.py +99 -0
  47. anysite/main.py +498 -0
  48. anysite/models/__init__.py +1 -0
  49. anysite/output/__init__.py +11 -0
  50. anysite/output/console.py +45 -0
  51. anysite/output/formatters.py +301 -0
  52. anysite/output/templates.py +76 -0
  53. anysite/py.typed +0 -0
  54. anysite/streaming/__init__.py +1 -0
  55. anysite/streaming/progress.py +121 -0
  56. anysite/streaming/writer.py +130 -0
  57. anysite/utils/__init__.py +1 -0
  58. anysite/utils/fields.py +242 -0
  59. anysite/utils/retry.py +109 -0
  60. anysite_cli-0.1.0.dist-info/METADATA +437 -0
  61. anysite_cli-0.1.0.dist-info/RECORD +64 -0
  62. anysite_cli-0.1.0.dist-info/WHEEL +4 -0
  63. anysite_cli-0.1.0.dist-info/entry_points.txt +2 -0
  64. anysite_cli-0.1.0.dist-info/licenses/LICENSE +21 -0
anysite/api/schemas.py ADDED
@@ -0,0 +1,333 @@
+ """Dynamic OpenAPI schema cache for endpoint discovery.
+
+ Fetches the OpenAPI spec from the API, resolves $ref references,
+ and caches a compact representation of all endpoints with their
+ input parameters and output fields.
+
+ Used by `anysite describe` and `anysite schema update` commands.
+ """
+
+ from __future__ import annotations
+
+ import json
+ from datetime import UTC, datetime
+ from pathlib import Path
+ from typing import Any
+
+ import httpx
+
+ OPENAPI_URL = "https://api.anysite.io/openapi.json"
+ CACHE_VERSION = "0.0.1"
+
+
+ def get_cache_path() -> Path:
+     """Get the schema cache file path."""
+     from anysite.config.paths import get_schema_cache_path
+
+     return get_schema_cache_path()
+
+
+ def resolve_ref(spec: dict[str, Any], ref_path: str) -> dict[str, Any]:
+     """Resolve a $ref path within the OpenAPI spec.
+
+     Args:
+         spec: Full OpenAPI spec dict.
+         ref_path: Reference string like '#/components/schemas/UserProfile'.
+
+     Returns:
+         The resolved schema dict.
+     """
+     parts = ref_path.lstrip("#/").split("/")
+     node: Any = spec
+     for part in parts:
+         node = node[part]
+     return node  # type: ignore[no-any-return]
+
+
+ def _resolve_schema(spec: dict[str, Any], schema: dict[str, Any]) -> dict[str, Any]:
+     """Recursively resolve all $ref in a schema."""
+     if "$ref" in schema:
+         resolved = resolve_ref(spec, schema["$ref"])
+         return _resolve_schema(spec, resolved)
+
+     if "allOf" in schema:
+         merged: dict[str, Any] = {}
+         for sub in schema["allOf"]:
+             resolved_sub = _resolve_schema(spec, sub)
+             merged.update(resolved_sub.get("properties", {}))
+         return {"type": "object", "properties": merged}
+
+     if "anyOf" in schema or "oneOf" in schema:
+         variants = schema.get("anyOf") or schema.get("oneOf", [])
+         # Pick first non-null variant
+         for variant in variants:
+             resolved_v = _resolve_schema(spec, variant)
+             if resolved_v.get("type") != "null":
+                 return resolved_v
+         return variants[0] if variants else schema
+
+     return schema
+
+
+ def _simplify_type(schema: dict[str, Any]) -> str:
+     """Convert a resolved schema to a simple type string."""
+     if "anyOf" in schema or "oneOf" in schema:
+         variants = schema.get("anyOf") or schema.get("oneOf", [])
+         types = []
+         for v in variants:
+             t = v.get("type", "object")
+             if t != "null":
+                 types.append(t)
+         return types[0] if types else "object"
+
+     return schema.get("type", "object")
+
+
+ def _extract_properties(spec: dict[str, Any], schema: dict[str, Any]) -> dict[str, str]:
+     """Extract flat field -> type mapping from a resolved schema."""
+     schema = _resolve_schema(spec, schema)
+
+     # Handle array of objects (most API responses)
+     if schema.get("type") == "array" and "items" in schema:
+         schema = _resolve_schema(spec, schema["items"])
+
+     properties = schema.get("properties", {})
+     result: dict[str, str] = {}
+     for field_name, field_schema in properties.items():
+         resolved_field = _resolve_schema(spec, field_schema)
+         result[field_name] = _simplify_type(resolved_field)
+     return result
+
+
+ def _extract_input(spec: dict[str, Any], method_data: dict[str, Any]) -> dict[str, dict[str, Any]]:
+     """Extract input parameters from request body schema."""
+     request_body = method_data.get("requestBody", {})
+     content = request_body.get("content", {})
+     json_content = content.get("application/json", {})
+     body_schema = json_content.get("schema", {})
+
+     if not body_schema:
+         return {}
+
+     resolved = _resolve_schema(spec, body_schema)
+     properties = resolved.get("properties", {})
+     required_fields = set(resolved.get("required", []))
+
+     result: dict[str, dict[str, Any]] = {}
+     for field_name, field_schema in properties.items():
+         resolved_field = _resolve_schema(spec, field_schema)
+         result[field_name] = {
+             "type": _simplify_type(resolved_field),
+             "required": field_name in required_fields,
+             "description": resolved_field.get("description", ""),
+         }
+     return result
+
+
+ def extract_endpoint_info(
+     spec: dict[str, Any], method_data: dict[str, Any]
+ ) -> dict[str, Any]:
+     """Extract input/output info for a single endpoint.
+
+     Args:
+         spec: Full OpenAPI spec.
+         method_data: The method dict (e.g. spec['paths'][path]['post']).
+
+     Returns:
+         Dict with description, tags, input, output.
+     """
+     description = method_data.get("description", "") or method_data.get("summary", "")
+     tags = method_data.get("tags", [])
+
+     # Extract input params
+     input_params = _extract_input(spec, method_data)
+
+     # Extract output fields from 200 response
+     responses = method_data.get("responses", {})
+     success_resp = responses.get("200", responses.get("201", {}))
+     resp_content = success_resp.get("content", {})
+     resp_json = resp_content.get("application/json", {})
+     resp_schema = resp_json.get("schema", {})
+
+     output_fields = _extract_properties(spec, resp_schema) if resp_schema else {}
+
+     return {
+         "description": description,
+         "tags": tags,
+         "input": input_params,
+         "output": output_fields,
+     }
+
+
+ def fetch_and_parse_openapi(url: str | None = None) -> dict[str, Any]:
+     """Fetch the OpenAPI spec and parse all endpoints into a compact cache format.
+
+     Args:
+         url: OpenAPI spec URL. Defaults to OPENAPI_URL.
+
+     Returns:
+         Cache dict with version, updated_at, and endpoints.
+     """
+     spec_url = url or OPENAPI_URL
+     response = httpx.get(spec_url, timeout=30, follow_redirects=True)
+     response.raise_for_status()
+     spec = response.json()
+
+     endpoints: dict[str, Any] = {}
+
+     for path, path_data in spec.get("paths", {}).items():
+         # Process POST and GET endpoints (POST is the primary API pattern)
+         for method in ("post", "get"):
+             if method not in path_data:
+                 continue
+             method_data = path_data[method]
+             try:
+                 info = extract_endpoint_info(spec, method_data)
+                 endpoints[path] = info
+             except (KeyError, TypeError):
+                 # Skip endpoints that can't be parsed
+                 continue
+
+     return {
+         "version": CACHE_VERSION,
+         "updated_at": datetime.now(UTC).isoformat(),
+         "endpoints": endpoints,
+     }
+
+
+ def save_cache(data: dict[str, Any]) -> Path:
+     """Save schema cache to disk.
+
+     Returns:
+         Path where the cache was saved.
+     """
+     from anysite.config.paths import ensure_config_dir
+
+     ensure_config_dir()
+     cache_path = get_cache_path()
+     cache_path.write_text(json.dumps(data, indent=2))
+     return cache_path
+
+
+ def load_cache() -> dict[str, Any] | None:
+     """Load schema cache from disk.
+
+     Returns:
+         Cached data dict, or None if cache doesn't exist.
+     """
+     cache_path = get_cache_path()
+     if not cache_path.exists():
+         return None
+     try:
+         return json.loads(cache_path.read_text())  # type: ignore[no-any-return]
+     except (json.JSONDecodeError, OSError):
+         return None
+
+
+ def _normalize_command(command: str) -> str:
+     """Convert shorthand command to API path.
+
+     Examples:
+         'linkedin.user' -> '/api/linkedin/user'
+         '/api/linkedin/user' -> '/api/linkedin/user'
+         'linkedin.search-users' -> '/api/linkedin/search/users'
+         'linkedin.company-employees' -> '/api/linkedin/company/employees'
+     """
+     if command.startswith("/"):
+         return command
+     # linkedin.search-users -> /api/linkedin/search/users
+     parts = command.replace(".", "/").replace("-", "/")
+     return f"/api/{parts}"
+
+
+ def get_schema(command: str) -> dict[str, Any] | None:
+     """Get schema info for an endpoint.
+
+     Args:
+         command: Command identifier like 'linkedin.user' or '/api/linkedin/user'.
+
+     Returns:
+         Dict with description, tags, input, output, or None if not found.
+     """
+     cache = load_cache()
+     if cache is None:
+         return None
+
+     endpoints = cache.get("endpoints", {})
+     path = _normalize_command(command)
+
+     if path in endpoints:
+         return endpoints[path]  # type: ignore[no-any-return]
+
+     # Try fuzzy match: look for path ending
+     for ep_path, ep_data in endpoints.items():
+         if ep_path.endswith(path.lstrip("/")):
+             return ep_data  # type: ignore[no-any-return]
+
+     return None
+
+
+ def search_endpoints(query: str) -> list[dict[str, Any]]:
+     """Search endpoints by keyword in path, description, or tags.
+
+     Args:
+         query: Search string.
+
+     Returns:
+         List of dicts with path, description, tags.
+     """
+     cache = load_cache()
+     if cache is None:
+         return []
+
+     query_lower = query.lower()
+     results = []
+
+     for path, info in cache.get("endpoints", {}).items():
+         description = info.get("description", "")
+         tags = " ".join(info.get("tags", []))
+         searchable = f"{path} {description} {tags}".lower()
+
+         if query_lower in searchable:
+             results.append({
+                 "path": path,
+                 "description": description,
+                 "tags": info.get("tags", []),
+             })
+
+     return results
+
+
+ def list_endpoints() -> list[dict[str, Any]]:
+     """List all cached endpoints.
+
+     Returns:
+         List of dicts with path and description.
+     """
+     cache = load_cache()
+     if cache is None:
+         return []
+
+     return [
+         {"path": path, "description": info.get("description", "")}
+         for path, info in cache.get("endpoints", {}).items()
+     ]
+
+
+ def convert_value(value: str, type_hint: str) -> str | int | bool | float:
+     """Convert a string value to the type specified in the schema.
+
+     Args:
+         value: Raw string value from CLI key=value arg.
+         type_hint: Type from schema ('integer', 'boolean', 'number', 'string', etc.)
+
+     Returns:
+         Converted value.
+     """
+     if type_hint == "integer":
+         return int(value)
+     if type_hint == "boolean":
+         return value.lower() in ("true", "1", "yes")
+     if type_hint == "number":
+         return float(value)
+     return value
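
Taken together, these functions form a fetch -> cache -> query pipeline. A minimal usage sketch of that flow follows, assuming the wheel is installed (so `anysite.api.schemas` is importable) and the OPENAPI_URL endpoint is reachable; the 'linkedin.user' lookup mirrors the docstring examples above:

from anysite.api import schemas

# Refresh the on-disk cache from the live OpenAPI spec (network call).
cache = schemas.fetch_and_parse_openapi()
saved_to = schemas.save_cache(cache)
print(f"cached {len(cache['endpoints'])} endpoints -> {saved_to}")

# Shorthand commands are expanded to API paths before lookup.
info = schemas.get_schema("linkedin.user")  # resolves to '/api/linkedin/user'
if info:
    for name, meta in info["input"].items():
        flag = "required" if meta["required"] else "optional"
        print(f"{name}: {meta['type']} ({flag})")

# Raw key=value strings from the CLI can be coerced per the cached types.
assert schemas.convert_value("25", "integer") == 25
assert schemas.convert_value("yes", "boolean") is True
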
anysite/batch/__init__.py ADDED
@@ -0,0 +1 @@
+ """Batch processing modules."""
anysite/batch/executor.py ADDED
@@ -0,0 +1,176 @@
+ """Batch executor for running commands across multiple inputs."""
+
+ import asyncio
+ import time
+ from collections.abc import Awaitable, Callable
+ from typing import Any
+
+ from anysite.batch.rate_limiter import RateLimiter
+ from anysite.cli.options import ErrorHandling
+ from anysite.output.console import print_warning
+
+
+ class BatchResult:
+     """Result of a batch execution."""
+
+     def __init__(self) -> None:
+         self.results: list[dict[str, Any]] = []
+         self.errors: list[dict[str, Any]] = []
+         self.total: int = 0
+         self.succeeded: int = 0
+         self.failed: int = 0
+         self.skipped: int = 0
+         self.start_time: float = 0
+         self.end_time: float = 0
+
+     @property
+     def elapsed(self) -> float:
+         """Elapsed time in seconds."""
+         return self.end_time - self.start_time
+
+     @property
+     def rate(self) -> float:
+         """Records per second."""
+         if self.elapsed > 0:
+             return self.succeeded / self.elapsed
+         return 0
+
+
+ class BatchExecutor:
+     """Execute a command function across multiple inputs.
+
+     Supports sequential and parallel execution with rate limiting,
+     delay between requests, and configurable error handling.
+     """
+
+     def __init__(
+         self,
+         func: Callable[[str | dict[str, Any]], Awaitable[Any]],
+         parallel: int = 1,
+         delay: float = 0.0,
+         on_error: ErrorHandling = ErrorHandling.STOP,
+         rate_limiter: RateLimiter | None = None,
+         progress_callback: Callable[[int], None] | None = None,
+     ) -> None:
+         """Initialize batch executor.
+
+         Args:
+             func: Async function to call for each input
+             parallel: Number of concurrent requests
+             delay: Delay between sequential requests (seconds)
+             on_error: Error handling mode
+             rate_limiter: Optional rate limiter
+             progress_callback: Optional callback for progress updates
+         """
+         self.func = func
+         self.parallel = max(1, parallel)
+         self.delay = delay
+         self.on_error = on_error
+         self.rate_limiter = rate_limiter
+         self.progress_callback = progress_callback
+
+     async def execute(self, inputs: list[str | dict[str, Any]]) -> BatchResult:
+         """Execute the function for all inputs.
+
+         Args:
+             inputs: List of inputs to process
+
+         Returns:
+             BatchResult with results and statistics
+         """
+         result = BatchResult()
+         result.total = len(inputs)
+         result.start_time = time.monotonic()
+
+         if self.parallel > 1:
+             await self._execute_parallel(inputs, result)
+         else:
+             await self._execute_sequential(inputs, result)
+
+         result.end_time = time.monotonic()
+         return result
+
+     async def _execute_sequential(
+         self,
+         inputs: list[str | dict[str, Any]],
+         result: BatchResult,
+     ) -> None:
+         """Execute inputs one at a time."""
+         for i, inp in enumerate(inputs):
+             try:
+                 if self.rate_limiter:
+                     await self.rate_limiter.acquire()
+
+                 data = await self.func(inp)
+                 result.results.append(data if isinstance(data, dict) else {"data": data})
+                 result.succeeded += 1
+
+                 if self.progress_callback:
+                     self.progress_callback(1)
+
+                 if self.delay > 0 and i < len(inputs) - 1:
+                     await asyncio.sleep(self.delay)
+
+             except Exception as e:
+                 result.failed += 1
+                 error_info = {"input": str(inp), "error": str(e)}
+                 result.errors.append(error_info)
+
+                 if self.on_error == ErrorHandling.STOP:
+                     raise
+                 elif self.on_error == ErrorHandling.SKIP:
+                     print_warning(f"Skipping '{inp}': {e}")
+                     result.skipped += 1
+                 if self.progress_callback:
+                     self.progress_callback(1)
+
+     async def _execute_parallel(
+         self,
+         inputs: list[str | dict[str, Any]],
+         result: BatchResult,
+     ) -> None:
+         """Execute inputs with limited concurrency."""
+         semaphore = asyncio.Semaphore(self.parallel)
+
+         async def _process(inp: str | dict[str, Any]) -> None:
+             async with semaphore:
+                 try:
+                     if self.rate_limiter:
+                         await self.rate_limiter.acquire()
+
+                     data = await self.func(inp)
+                     result.results.append(
+                         data if isinstance(data, dict) else {"data": data}
+                     )
+                     result.succeeded += 1
+
+                     if self.progress_callback:
+                         self.progress_callback(1)
+
+                 except Exception as e:
+                     result.failed += 1
+                     error_info = {"input": str(inp), "error": str(e)}
+                     result.errors.append(error_info)
+
+                     if self.on_error == ErrorHandling.STOP:
+                         raise
+                     elif self.on_error == ErrorHandling.SKIP:
+                         print_warning(f"Skipping '{inp}': {e}")
+                         result.skipped += 1
+                     if self.progress_callback:
+                         self.progress_callback(1)
+
+         tasks = [asyncio.create_task(_process(inp)) for inp in inputs]
+
+         if self.on_error == ErrorHandling.STOP:
+             # If any task fails, cancel the rest
+             try:
+                 await asyncio.gather(*tasks)
+             except Exception:
+                 for task in tasks:
+                     if not task.done():
+                         task.cancel()
+                 raise
+         else:
+             # Let all tasks complete even if some fail
+             await asyncio.gather(*tasks, return_exceptions=True)
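
For context, a hedged sketch of how BatchExecutor appears intended to be driven; the `fetch` stub and its inputs are illustrative stand-ins, not part of the package:

import asyncio

from anysite.batch.executor import BatchExecutor
from anysite.cli.options import ErrorHandling

async def fetch(inp):
    # Illustrative stand-in for a real per-input API call.
    await asyncio.sleep(0.1)
    return {"input": inp, "ok": True}

async def main() -> None:
    # SKIP keeps going past failures; STOP (the default) re-raises.
    executor = BatchExecutor(fetch, parallel=4, on_error=ErrorHandling.SKIP)
    result = await executor.execute(["alice", "bob", "carol"])
    print(f"{result.succeeded}/{result.total} ok "
          f"in {result.elapsed:.2f}s ({result.rate:.1f}/s)")

asyncio.run(main())
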
anysite/batch/input.py ADDED
@@ -0,0 +1,160 @@
+ """Batch input parser for reading inputs from files and stdin."""
+
+ import csv
+ import io
+ import sys
+ from enum import Enum
+ from pathlib import Path
+ from typing import Any
+
+ import orjson
+
+
+ class InputFormat(str, Enum):
+     """Supported input file formats."""
+
+     TEXT = "text"
+     JSONL = "jsonl"
+     CSV = "csv"
+
+
+ class InputParser:
+     """Parse batch inputs from various sources."""
+
+     @staticmethod
+     def from_file(path: Path) -> "list[str | dict[str, Any]]":
+         """Load inputs from a file, auto-detecting format.
+
+         Args:
+             path: Path to input file
+
+         Returns:
+             List of inputs (strings for text, dicts for JSONL/CSV)
+
+         Raises:
+             FileNotFoundError: If file doesn't exist
+             ValueError: If file is empty or format is unsupported
+         """
+         if not path.exists():
+             raise FileNotFoundError(f"Input file not found: {path}")
+
+         content = path.read_text(encoding="utf-8").strip()
+         if not content:
+             raise ValueError(f"Input file is empty: {path}")
+
+         fmt = InputParser.detect_format(path)
+
+         results: list[str | dict[str, Any]]
+         if fmt == InputFormat.JSONL:
+             results = InputParser.parse_jsonl(content)  # type: ignore[assignment]
+         elif fmt == InputFormat.CSV:
+             results = InputParser.parse_csv(content)  # type: ignore[assignment]
+         else:
+             results = InputParser.parse_text(content)  # type: ignore[assignment]
+         return results
+
+     @staticmethod
+     def from_stdin() -> "list[str | dict[str, Any]]":
+         """Read inputs from stdin.
+
+         Returns:
+             List of inputs (strings, one per line)
+
+         Raises:
+             ValueError: If stdin is empty or is a TTY
+         """
+         if sys.stdin.isatty():
+             raise ValueError(
+                 "No input detected on stdin. "
+                 "Pipe data or use --from-file instead."
+             )
+
+         content = sys.stdin.read().strip()
+         if not content:
+             raise ValueError("No input received from stdin.")
+
+         # Try to detect format
+         first_line = content.split("\n")[0].strip()
+         if first_line.startswith("{"):
+             return InputParser.parse_jsonl(content)  # type: ignore[return-value]
+         return InputParser.parse_text(content)  # type: ignore[return-value]
+
+     @staticmethod
+     def detect_format(path: Path) -> InputFormat:
+         """Detect input file format from extension.
+
+         Args:
+             path: Path to input file
+
+         Returns:
+             Detected InputFormat
+         """
+         suffix = path.suffix.lower()
+
+         if suffix in (".jsonl", ".ndjson"):
+             return InputFormat.JSONL
+         elif suffix == ".csv":
+             return InputFormat.CSV
+         elif suffix in (".json",):
+             # Check if it's actually JSONL (one JSON per line)
+             content = path.read_text(encoding="utf-8").strip()
+             lines = content.split("\n")
+             if len(lines) > 1 and lines[0].strip().startswith("{"):
+                 return InputFormat.JSONL
+             return InputFormat.TEXT
+         else:
+             return InputFormat.TEXT
+
+     @staticmethod
+     def parse_text(content: str) -> list[str]:
+         """Parse plain text input (one value per line).
+
+         Args:
+             content: Raw text content
+
+         Returns:
+             List of non-empty stripped lines
+         """
+         lines = content.strip().split("\n")
+         return [line.strip() for line in lines if line.strip()]
+
+     @staticmethod
+     def parse_jsonl(content: str) -> list[dict[str, Any]]:
+         """Parse JSON Lines input.
+
+         Args:
+             content: JSONL content
+
+         Returns:
+             List of parsed dictionaries
+
+         Raises:
+             ValueError: If a line contains invalid JSON
+         """
+         results: list[dict[str, Any]] = []
+         for i, line in enumerate(content.strip().split("\n"), 1):
+             line = line.strip()
+             if not line:
+                 continue
+             try:
+                 parsed = orjson.loads(line)
+                 if isinstance(parsed, dict):
+                     results.append(parsed)
+                 else:
+                     results.append({"value": parsed})
+             except orjson.JSONDecodeError as e:
+                 raise ValueError(f"Invalid JSON on line {i}: {e}") from e
+         return results
+
+     @staticmethod
+     def parse_csv(content: str) -> list[dict[str, Any]]:
+         """Parse CSV input with headers.
+
+         Args:
+             content: CSV content with header row
+
+         Returns:
+             List of dictionaries (one per row)
+         """
+         reader = csv.DictReader(io.StringIO(content))
+         return [dict(row) for row in reader]
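
A small sketch of the parser's behavior on in-memory content, assuming the wheel is installed; the sample values are illustrative:

from anysite.batch.input import InputParser

# Plain text: one value per line, blanks dropped.
assert InputParser.parse_text("alice\nbob\n\ncarol\n") == ["alice", "bob", "carol"]

# JSONL: dicts pass through; bare scalars are wrapped under "value".
rows = InputParser.parse_jsonl('{"user": "alice"}\n"bob"')
assert rows == [{"user": "alice"}, {"value": "bob"}]

# CSV: the header row becomes dict keys; all values stay strings.
assert InputParser.parse_csv("name,age\nalice,30") == [{"name": "alice", "age": "30"}]
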