anysite-cli 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of anysite-cli might be problematic. Click here for more details.
- anysite/__init__.py +4 -0
- anysite/__main__.py +6 -0
- anysite/api/__init__.py +21 -0
- anysite/api/client.py +271 -0
- anysite/api/errors.py +137 -0
- anysite/api/schemas.py +333 -0
- anysite/batch/__init__.py +1 -0
- anysite/batch/executor.py +176 -0
- anysite/batch/input.py +160 -0
- anysite/batch/rate_limiter.py +98 -0
- anysite/cli/__init__.py +1 -0
- anysite/cli/config.py +176 -0
- anysite/cli/executor.py +388 -0
- anysite/cli/options.py +249 -0
- anysite/config/__init__.py +11 -0
- anysite/config/paths.py +46 -0
- anysite/config/settings.py +187 -0
- anysite/dataset/__init__.py +37 -0
- anysite/dataset/analyzer.py +268 -0
- anysite/dataset/cli.py +644 -0
- anysite/dataset/collector.py +686 -0
- anysite/dataset/db_loader.py +248 -0
- anysite/dataset/errors.py +30 -0
- anysite/dataset/exporters.py +121 -0
- anysite/dataset/history.py +153 -0
- anysite/dataset/models.py +245 -0
- anysite/dataset/notifications.py +87 -0
- anysite/dataset/scheduler.py +107 -0
- anysite/dataset/storage.py +171 -0
- anysite/dataset/transformer.py +213 -0
- anysite/db/__init__.py +38 -0
- anysite/db/adapters/__init__.py +1 -0
- anysite/db/adapters/base.py +158 -0
- anysite/db/adapters/postgres.py +201 -0
- anysite/db/adapters/sqlite.py +183 -0
- anysite/db/cli.py +709 -0
- anysite/db/config.py +92 -0
- anysite/db/manager.py +166 -0
- anysite/db/operations/__init__.py +1 -0
- anysite/db/operations/insert.py +199 -0
- anysite/db/operations/query.py +43 -0
- anysite/db/schema/__init__.py +1 -0
- anysite/db/schema/inference.py +213 -0
- anysite/db/schema/types.py +71 -0
- anysite/db/utils/__init__.py +1 -0
- anysite/db/utils/sanitize.py +99 -0
- anysite/main.py +498 -0
- anysite/models/__init__.py +1 -0
- anysite/output/__init__.py +11 -0
- anysite/output/console.py +45 -0
- anysite/output/formatters.py +301 -0
- anysite/output/templates.py +76 -0
- anysite/py.typed +0 -0
- anysite/streaming/__init__.py +1 -0
- anysite/streaming/progress.py +121 -0
- anysite/streaming/writer.py +130 -0
- anysite/utils/__init__.py +1 -0
- anysite/utils/fields.py +242 -0
- anysite/utils/retry.py +109 -0
- anysite_cli-0.1.2.dist-info/METADATA +455 -0
- anysite_cli-0.1.2.dist-info/RECORD +64 -0
- anysite_cli-0.1.2.dist-info/WHEEL +4 -0
- anysite_cli-0.1.2.dist-info/entry_points.txt +2 -0
- anysite_cli-0.1.2.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
"""Output formatters for different formats."""
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
import io
|
|
5
|
+
from enum import Enum
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import orjson
|
|
10
|
+
from rich.table import Table
|
|
11
|
+
|
|
12
|
+
from anysite.output.console import console
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class OutputFormat(str, Enum):
    """Supported output formats.

    Inherits from ``str`` so members compare equal to their plain string
    values (e.g. ``OutputFormat.JSON == "json"``), letting CLI option
    parsing hand raw strings around interchangeably.
    """

    JSON = "json"  # single JSON document (optionally indented)
    JSONL = "jsonl"  # newline-delimited JSON, one object per line
    CSV = "csv"  # comma-separated values with flattened columns
    TABLE = "table"  # Rich table rendered to the console
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def filter_fields(data: dict[str, Any], fields: list[str]) -> dict[str, Any]:
    """Project a dictionary onto the given field names.

    Dot notation selects nested values (e.g. ``'experience.company'``);
    numeric path segments index into lists.

    Args:
        data: Source dictionary
        fields: List of field names to include

    Returns:
        Filtered dictionary
    """
    if not fields:
        return data

    def _dig(source: Any, path: list[str]) -> Any:
        # Follow the path one segment at a time; give up with None the
        # moment a segment cannot be resolved.
        node = source
        for segment in path:
            if isinstance(node, dict):
                node = node.get(segment)
            elif isinstance(node, list) and segment.isdigit():
                position = int(segment)
                node = node[position] if position < len(node) else None
            else:
                return None
        return node

    selected: dict[str, Any] = {}
    for spec in fields:
        if "." not in spec:
            if spec in data:
                selected[spec] = data[spec]
            continue

        path = spec.split(".")
        value = _dig(data, path)
        if value is None:
            continue

        # Rebuild the nested shape in the output dictionary.
        cursor = selected
        for segment in path[:-1]:
            cursor = cursor.setdefault(segment, {})
        cursor[path[-1]] = value

    return selected
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def flatten_for_csv(data: dict[str, Any], prefix: str = "") -> dict[str, Any]:
    """Flatten a nested dictionary into dot-notation keys for CSV rows.

    Args:
        data: Nested dictionary
        prefix: Prefix for nested keys

    Returns:
        Flattened dictionary with dot-notation keys
    """
    flat: dict[str, Any] = {}
    scalar_types = (str, int, float, bool, type(None))

    for name, value in data.items():
        key = f"{prefix}.{name}" if prefix else name

        if isinstance(value, dict):
            # Recurse into sub-dictionaries, extending the key prefix.
            flat.update(flatten_for_csv(value, key))
            continue

        if not isinstance(value, list):
            flat[key] = value
            continue

        if all(isinstance(entry, scalar_types) for entry in value):
            # Scalar list: render as one joined cell, dropping Nones.
            flat[key] = "; ".join(str(entry) for entry in value if entry is not None)
        else:
            # Complex list: record its size and expand at most three entries.
            flat[f"{key}_count"] = len(value)
            for position, entry in enumerate(value[:3]):
                if isinstance(entry, dict):
                    flat.update(flatten_for_csv(entry, f"{key}_{position}"))
                else:
                    flat[f"{key}_{position}"] = entry

    return flat
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def format_json(data: Any, indent: bool = True) -> str:
    """Serialize data to a JSON string via orjson.

    Args:
        data: Data to format
        indent: Whether to indent (pretty print)

    Returns:
        JSON string
    """
    flags = orjson.OPT_INDENT_2 if indent else 0
    serialized: bytes = orjson.dumps(data, option=flags)
    return serialized.decode("utf-8")
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def format_jsonl(data: list[dict[str, Any]]) -> str:
    """Serialize records as newline-delimited JSON (JSONL).

    Args:
        data: List of dictionaries

    Returns:
        JSONL string with one JSON object per line
    """
    return "\n".join(orjson.dumps(record).decode("utf-8") for record in data)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def format_csv_output(data: list[dict[str, Any]], fields: list[str] | None = None) -> str:
    """Render records as CSV text with a header row.

    Args:
        data: List of dictionaries
        fields: Optional list of fields to include

    Returns:
        CSV string with headers
    """
    if not data:
        return ""

    # Flatten every record so nested values become dot-notation columns.
    rows = [flatten_for_csv(record) for record in data]

    if fields:
        columns = fields
    else:
        # Preserve first-seen order of every distinct key across all rows.
        seen: dict[str, None] = {}
        for row in rows:
            for column in row:
                seen.setdefault(column, None)
        columns = list(seen)

    buffer = io.StringIO()
    csv_writer = csv.DictWriter(buffer, fieldnames=columns, extrasaction="ignore")
    csv_writer.writeheader()
    csv_writer.writerows(rows)
    return buffer.getvalue()
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def format_table_output(data: list[dict[str, Any]], fields: list[str] | None = None) -> None:
    """Render records as a Rich table printed to the console.

    A single record is shown vertically as field/value pairs; multiple
    records are shown as a grid with a limited set of columns.

    Args:
        data: List of dictionaries
        fields: Optional list of fields to include
    """
    if not data:
        console.print("[dim]No results[/dim]")
        return

    if len(data) == 1:
        # Single record: a vertical field/value layout is easier to read.
        record = filter_fields(data[0], fields) if fields else data[0]

        table = Table(show_header=True, header_style="bold")
        table.add_column("Field", style="cyan")
        table.add_column("Value")

        for name, value in record.items():
            if isinstance(value, (dict, list)):
                value = format_json(value, indent=False)
            rendered = "[dim]null[/dim]" if value is None else str(value)
            table.add_row(name, rendered)

        console.print(table)
        return

    # Multiple records: flatten each one and lay them out as a grid.
    rows = [flatten_for_csv(record) for record in data]

    if fields:
        columns = fields
    else:
        ordered_keys = list(dict.fromkeys(key for row in rows for key in row.keys()))
        # Put well-known identity fields first, then pad with whatever else
        # appears, capping the table at roughly 10 columns for readability.
        preferred = ["name", "full_name", "headline", "title", "company", "url", "followers"]
        columns = [key for key in preferred if key in ordered_keys]
        remaining = [key for key in ordered_keys if key not in columns]
        columns.extend(remaining[: 10 - len(columns)])

    table = Table(show_header=True, header_style="bold")
    for column in columns:
        table.add_column(column)

    for row in rows:
        cells = []
        for column in columns:
            cell = row.get(column, "")
            if isinstance(cell, (dict, list)):
                cell = "..."
            cells.append("" if cell is None else str(cell))
        table.add_row(*cells)

    console.print(table)
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def format_output(
    data: Any,
    output_format: OutputFormat,
    fields: list[str] | None = None,
    output_file: Path | None = None,
    quiet: bool = False,
    exclude: list[str] | None = None,
    compact: bool = False,
    append: bool = False,
) -> None:
    """Format data in the requested format and emit it.

    Args:
        data: Data to format (usually list of dicts from API)
        output_format: Output format (json, jsonl, csv, table)
        fields: Optional list of fields to include
        output_file: Optional file path to write output
        quiet: Suppress non-data output
        exclude: Optional list of fields to exclude
        compact: Use compact output (no indentation)
        append: Append to existing file
    """
    records = data if isinstance(data, list) else [data]

    if fields:
        records = [filter_fields(record, fields) for record in records]

    if exclude:
        # Imported lazily so module load stays cheap and cycle-free.
        from anysite.utils.fields import exclude_fields

        records = [exclude_fields(record, exclude) for record in records]

    if output_format == OutputFormat.TABLE:
        if output_file is None:
            format_table_output(records, fields)
        else:
            # A Rich table cannot be serialized to a file; fall back to JSON.
            rendered = format_json(records, indent=not compact)
            _write_output(rendered, output_file, quiet, append=append)
        return

    if output_format == OutputFormat.JSONL:
        rendered = format_jsonl(records)
    elif output_format == OutputFormat.CSV:
        rendered = format_csv_output(records, fields)
    else:
        # JSON proper, and also the fallback for any unknown format value.
        rendered = format_json(records, indent=not compact)

    _write_output(rendered, output_file, quiet, append=append)
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def _write_output(
|
|
278
|
+
content: str,
|
|
279
|
+
output_file: Path | None,
|
|
280
|
+
quiet: bool,
|
|
281
|
+
append: bool = False,
|
|
282
|
+
) -> None:
|
|
283
|
+
"""Write content to file or stdout.
|
|
284
|
+
|
|
285
|
+
Args:
|
|
286
|
+
content: Content to write
|
|
287
|
+
output_file: Optional file path
|
|
288
|
+
quiet: Suppress messages
|
|
289
|
+
append: Append to existing file
|
|
290
|
+
"""
|
|
291
|
+
if output_file:
|
|
292
|
+
mode = "a" if append else "w"
|
|
293
|
+
with open(output_file, mode, encoding="utf-8") as f:
|
|
294
|
+
f.write(content)
|
|
295
|
+
if not quiet:
|
|
296
|
+
from anysite.output.console import print_success
|
|
297
|
+
|
|
298
|
+
action = "appended to" if append else "saved to"
|
|
299
|
+
print_success(f"Output {action} {output_file}")
|
|
300
|
+
else:
|
|
301
|
+
print(content)
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Filename template resolution for batch output."""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class FilenameTemplate:
|
|
8
|
+
"""Resolve filename templates with variable substitution.
|
|
9
|
+
|
|
10
|
+
Supported variables:
|
|
11
|
+
{id} - Record ID or input value
|
|
12
|
+
{username} - Username field from record
|
|
13
|
+
{date} - Current date (YYYY-MM-DD)
|
|
14
|
+
{datetime} - Current date and time (YYYY-MM-DD_HH-MM-SS)
|
|
15
|
+
{timestamp} - Unix timestamp
|
|
16
|
+
{index} - Zero-padded index
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
def __init__(self, template: str, extension: str = ".json") -> None:
|
|
20
|
+
"""Initialize template.
|
|
21
|
+
|
|
22
|
+
Args:
|
|
23
|
+
template: Template string with {variable} placeholders
|
|
24
|
+
extension: File extension to append
|
|
25
|
+
"""
|
|
26
|
+
self.template = template
|
|
27
|
+
self.extension = extension
|
|
28
|
+
|
|
29
|
+
def resolve(
|
|
30
|
+
self,
|
|
31
|
+
record: dict[str, Any] | None = None,
|
|
32
|
+
index: int = 0,
|
|
33
|
+
input_value: str = "",
|
|
34
|
+
) -> str:
|
|
35
|
+
"""Resolve template variables to an actual filename.
|
|
36
|
+
|
|
37
|
+
Args:
|
|
38
|
+
record: Data record (for extracting fields)
|
|
39
|
+
index: Item index in batch
|
|
40
|
+
input_value: Original input value
|
|
41
|
+
|
|
42
|
+
Returns:
|
|
43
|
+
Resolved filename string with extension
|
|
44
|
+
"""
|
|
45
|
+
now = datetime.now()
|
|
46
|
+
record = record or {}
|
|
47
|
+
|
|
48
|
+
variables = {
|
|
49
|
+
"id": input_value or record.get("id", record.get("urn", str(index))),
|
|
50
|
+
"username": record.get("username", record.get("user", input_value)),
|
|
51
|
+
"date": now.strftime("%Y-%m-%d"),
|
|
52
|
+
"datetime": now.strftime("%Y-%m-%d_%H-%M-%S"),
|
|
53
|
+
"timestamp": str(int(now.timestamp())),
|
|
54
|
+
"index": f"{index:04d}",
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
filename = self.template
|
|
58
|
+
for key, value in variables.items():
|
|
59
|
+
filename = filename.replace(f"{{{key}}}", str(value))
|
|
60
|
+
|
|
61
|
+
# Sanitize filename
|
|
62
|
+
filename = self._sanitize(filename)
|
|
63
|
+
|
|
64
|
+
# Add extension if not present
|
|
65
|
+
if not any(filename.endswith(ext) for ext in [".json", ".jsonl", ".csv"]):
|
|
66
|
+
filename += self.extension
|
|
67
|
+
|
|
68
|
+
return filename
|
|
69
|
+
|
|
70
|
+
@staticmethod
|
|
71
|
+
def _sanitize(filename: str) -> str:
|
|
72
|
+
"""Remove or replace unsafe characters from filename."""
|
|
73
|
+
unsafe = '<>:"/\\|?*'
|
|
74
|
+
for char in unsafe:
|
|
75
|
+
filename = filename.replace(char, "_")
|
|
76
|
+
return filename.strip(". ")
|
anysite/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Streaming output modules."""
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""Progress tracking with Rich progress bars."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
import time
|
|
5
|
+
|
|
6
|
+
from rich.progress import (
|
|
7
|
+
BarColumn,
|
|
8
|
+
MofNCompleteColumn,
|
|
9
|
+
Progress,
|
|
10
|
+
SpinnerColumn,
|
|
11
|
+
TextColumn,
|
|
12
|
+
TimeElapsedColumn,
|
|
13
|
+
TimeRemainingColumn,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ProgressTracker:
|
|
18
|
+
"""Rich progress bar wrapper for tracking long operations.
|
|
19
|
+
|
|
20
|
+
Automatically shows/hides based on terminal detection and quiet mode.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
def __init__(
|
|
24
|
+
self,
|
|
25
|
+
total: int | None = None,
|
|
26
|
+
description: str = "Processing...",
|
|
27
|
+
show: bool | None = None,
|
|
28
|
+
quiet: bool = False,
|
|
29
|
+
) -> None:
|
|
30
|
+
"""Initialize progress tracker.
|
|
31
|
+
|
|
32
|
+
Args:
|
|
33
|
+
total: Total number of items (None for indeterminate)
|
|
34
|
+
description: Description text
|
|
35
|
+
show: Force show/hide (None = auto-detect)
|
|
36
|
+
quiet: Suppress all output
|
|
37
|
+
"""
|
|
38
|
+
self.total = total
|
|
39
|
+
self.description = description
|
|
40
|
+
self.quiet = quiet
|
|
41
|
+
self._completed = 0
|
|
42
|
+
self._start_time = time.monotonic()
|
|
43
|
+
self._progress: Progress | None = None
|
|
44
|
+
|
|
45
|
+
# Determine if we should show progress
|
|
46
|
+
if quiet:
|
|
47
|
+
self._should_show = False
|
|
48
|
+
elif show is not None:
|
|
49
|
+
self._should_show = show
|
|
50
|
+
else:
|
|
51
|
+
# Auto: show if stdout is a terminal
|
|
52
|
+
self._should_show = sys.stderr.isatty()
|
|
53
|
+
|
|
54
|
+
def start(self) -> None:
|
|
55
|
+
"""Start the progress bar."""
|
|
56
|
+
if not self._should_show:
|
|
57
|
+
return
|
|
58
|
+
|
|
59
|
+
columns = [
|
|
60
|
+
SpinnerColumn(),
|
|
61
|
+
TextColumn("[bold blue]{task.description}"),
|
|
62
|
+
BarColumn(),
|
|
63
|
+
MofNCompleteColumn(),
|
|
64
|
+
TimeElapsedColumn(),
|
|
65
|
+
TimeRemainingColumn(),
|
|
66
|
+
]
|
|
67
|
+
|
|
68
|
+
self._progress = Progress(*columns, transient=True)
|
|
69
|
+
self._progress.start()
|
|
70
|
+
self._task_id = self._progress.add_task(
|
|
71
|
+
self.description,
|
|
72
|
+
total=self.total,
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
def update(self, n: int = 1) -> None:
|
|
76
|
+
"""Update progress by n items.
|
|
77
|
+
|
|
78
|
+
Args:
|
|
79
|
+
n: Number of items completed
|
|
80
|
+
"""
|
|
81
|
+
self._completed += n
|
|
82
|
+
if self._progress is not None:
|
|
83
|
+
self._progress.update(self._task_id, advance=n)
|
|
84
|
+
|
|
85
|
+
def set_status(self, text: str) -> None:
|
|
86
|
+
"""Update the description text.
|
|
87
|
+
|
|
88
|
+
Args:
|
|
89
|
+
text: New description
|
|
90
|
+
"""
|
|
91
|
+
self.description = text
|
|
92
|
+
if self._progress is not None:
|
|
93
|
+
self._progress.update(self._task_id, description=text)
|
|
94
|
+
|
|
95
|
+
def finish(self) -> None:
|
|
96
|
+
"""Complete the progress bar."""
|
|
97
|
+
if self._progress is not None:
|
|
98
|
+
self._progress.stop()
|
|
99
|
+
self._progress = None
|
|
100
|
+
|
|
101
|
+
def get_stats(self) -> dict[str, float | int]:
|
|
102
|
+
"""Get execution statistics.
|
|
103
|
+
|
|
104
|
+
Returns:
|
|
105
|
+
Dictionary with timing and throughput stats
|
|
106
|
+
"""
|
|
107
|
+
elapsed = time.monotonic() - self._start_time
|
|
108
|
+
rate = self._completed / elapsed if elapsed > 0 else 0
|
|
109
|
+
|
|
110
|
+
return {
|
|
111
|
+
"total": self._completed,
|
|
112
|
+
"elapsed_seconds": round(elapsed, 2),
|
|
113
|
+
"records_per_second": round(rate, 1),
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
def __enter__(self) -> "ProgressTracker":
|
|
117
|
+
self.start()
|
|
118
|
+
return self
|
|
119
|
+
|
|
120
|
+
def __exit__(self, *args: object) -> None:
|
|
121
|
+
self.finish()
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""Streaming record writer for outputting records one at a time."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import IO, Any
|
|
6
|
+
|
|
7
|
+
import orjson
|
|
8
|
+
|
|
9
|
+
from anysite.output.formatters import OutputFormat
|
|
10
|
+
from anysite.utils.fields import exclude_fields, filter_fields
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class StreamingWriter:
    """Write records one at a time to stdout or a file.

    Supports JSONL (primary) and CSV streaming output; any other format
    falls back to JSONL.
    """

    def __init__(
        self,
        output: Path | None = None,
        format: OutputFormat = OutputFormat.JSONL,
        fields: list[str] | None = None,
        exclude: list[str] | None = None,
        compact: bool = False,
        append: bool = False,
    ) -> None:
        """Initialize streaming writer.

        Args:
            output: Output file path (None = stdout)
            format: Output format (JSONL or CSV)
            fields: Fields to include
            exclude: Fields to exclude
            compact: Compact JSON output
            append: Append to existing file
        """
        self.output = output
        self.format = format
        self.fields = fields
        self.exclude = exclude
        self.compact = compact
        self.append = append
        self._file: IO[str] | None = None
        self._csv_headers_written = False
        self._count = 0

    def _get_writer(self) -> IO[str]:
        """Return the output stream, opening the file lazily on first use."""
        if self._file is not None:
            return self._file
        if self.output:
            mode = "a" if self.append else "w"
            self._file = open(self.output, mode, encoding="utf-8")  # noqa: SIM115
            return self._file
        return sys.stdout

    def _process_record(self, record: dict[str, Any]) -> dict[str, Any]:
        """Apply include/exclude field filtering to a record."""
        if self.fields:
            record = filter_fields(record, self.fields)
        if self.exclude:
            record = exclude_fields(record, self.exclude)
        return record

    def _write_jsonl(self, writer: IO[str], record: dict[str, Any]) -> None:
        """Serialize one record as a JSON line and flush immediately."""
        writer.write(orjson.dumps(record).decode("utf-8") + "\n")
        writer.flush()

    def _write_csv(self, writer: IO[str], record: dict[str, Any]) -> None:
        """Serialize one record as a CSV row, emitting the header first.

        The column set is fixed by the first record written; later records
        with extra keys have them dropped (extrasaction="ignore") and
        missing keys are left empty.
        """
        import csv
        import io

        from anysite.output.formatters import flatten_for_csv

        flat = flatten_for_csv(record)
        if not self._csv_headers_written:
            self._csv_fieldnames = list(flat.keys())

        buffer = io.StringIO()
        csv_writer = csv.DictWriter(
            buffer, fieldnames=self._csv_fieldnames, extrasaction="ignore"
        )
        if not self._csv_headers_written:
            csv_writer.writeheader()
            self._csv_headers_written = True
        csv_writer.writerow(flat)
        writer.write(buffer.getvalue())
        writer.flush()

    def write(self, record: dict[str, Any]) -> None:
        """Write a single record in the configured format.

        Refactored: the original duplicated the CSV serialization code in
        the header and non-header branches, and the JSONL code in the
        primary and fallback branches; behavior is unchanged.

        Args:
            record: Dictionary to write
        """
        record = self._process_record(record)
        writer = self._get_writer()

        if self.format == OutputFormat.CSV:
            self._write_csv(writer, record)
        else:
            # JSONL is both the primary format and the streaming fallback.
            self._write_jsonl(writer, record)

        self._count += 1

    def close(self) -> None:
        """Close the output file if one was opened."""
        if self._file is not None:
            self._file.close()
            self._file = None

    @property
    def count(self) -> int:
        """Number of records written so far."""
        return self._count

    def __enter__(self) -> "StreamingWriter":
        return self

    def __exit__(self, *args: object) -> None:
        self.close()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Utility modules."""
|