filedge 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- filedge/__init__.py +0 -0
- filedge/cdc.py +49 -0
- filedge/cli.py +526 -0
- filedge/column_types.py +79 -0
- filedge/compactor.py +158 -0
- filedge/config.py +102 -0
- filedge/connectors/__init__.py +110 -0
- filedge/connectors/bigquery.py +309 -0
- filedge/connectors/databricks.py +527 -0
- filedge/connectors/duckdb.py +181 -0
- filedge/connectors/postgres.py +163 -0
- filedge/connectors/sqlite.py +149 -0
- filedge/db.py +297 -0
- filedge/filesystem.py +73 -0
- filedge/hashing.py +11 -0
- filedge/health.py +104 -0
- filedge/inferrer.py +135 -0
- filedge/inspect_formatter.py +43 -0
- filedge/loader.py +66 -0
- filedge/log.py +84 -0
- filedge/parser.py +69 -0
- filedge/pipeline.py +151 -0
- filedge/preview_formatter.py +63 -0
- filedge/progress.py +232 -0
- filedge/schema.py +84 -0
- filedge/tracing.py +131 -0
- filedge/transform.py +31 -0
- filedge/validate_formatter.py +33 -0
- filedge/validator.py +51 -0
- filedge-0.1.0.dist-info/METADATA +40 -0
- filedge-0.1.0.dist-info/RECORD +34 -0
- filedge-0.1.0.dist-info/WHEEL +4 -0
- filedge-0.1.0.dist-info/entry_points.txt +2 -0
- filedge-0.1.0.dist-info/licenses/LICENSE +201 -0
filedge/__init__.py
ADDED
|
File without changes
|
filedge/cdc.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Iterable
|
|
3
|
+
|
|
4
|
+
from filedge.config import CdcConfig
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class CdcError(Exception):
    """Signals that a batch of CDC rows is unsafe to apply."""
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(frozen=True)
class CdcChange:
    """Immutable record pairing an operation with the key and row it targets."""

    # Operation name this change resolves to.
    operation: str
    # Values of the key columns identifying the affected row.
    key: tuple
    # Row payload carried with the change.
    row: dict
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def plan_cdc_changes(rows: Iterable[dict], cdc: CdcConfig) -> list[CdcChange]:
    """Collapse CDC rows into one change per key, keeping the highest sequence.

    Args:
        rows: Row dictionaries to inspect, consumed in iteration order.
        cdc: Configuration supplying ``operations`` (operation name -> raw
            values), ``operation_column``, ``keys`` and ``sequence_by``.

    Returns:
        One ``CdcChange`` per distinct key, in the order keys were first seen.

    Raises:
        CdcError: If a row's operation value is not configured, a key or
            sequence value is ``None``, or a row's sequence equals the
            sequence currently retained for the same key.
    """
    # Invert the configured mapping so raw column values resolve to operations.
    value_to_operation = {}
    for name, raw_values in cdc.operations.items():
        for raw in raw_values:
            value_to_operation[raw] = name

    winners = {}
    for record in rows:
        raw_operation = record.get(cdc.operation_column)
        operation = value_to_operation.get(raw_operation)
        if operation is None:
            raise CdcError(f"Unknown CDC operation: {raw_operation!r}")

        key = tuple(record.get(column) for column in cdc.keys)
        for part in key:
            if part is None:
                raise CdcError("CDC key columns cannot be null")

        sequence = record.get(cdc.sequence_by)
        if sequence is None:
            raise CdcError(f"CDC sequence column {cdc.sequence_by!r} cannot be null")

        current = winners.get(key)
        if current is None:
            # First row seen for this key: retain a copy of it.
            winners[key] = (sequence, operation, dict(record))
            continue
        if current[0] == sequence:
            raise CdcError(f"Multiple CDC rows for key {key!r} have the same sequence")
        if sequence > current[0]:
            winners[key] = (sequence, operation, dict(record))

    planned = []
    for key, (_sequence, operation, payload) in winners.items():
        planned.append(CdcChange(operation=operation, key=key, row=payload))
    return planned
|
filedge/cli.py
ADDED
|
@@ -0,0 +1,526 @@
|
|
|
1
|
+
import json as json_lib
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
|
|
7
|
+
from filedge.compactor import compact as run_compact
|
|
8
|
+
from filedge.connectors import SchemaError
|
|
9
|
+
from filedge.db import (
|
|
10
|
+
Database,
|
|
11
|
+
create_audit_tables,
|
|
12
|
+
find_file_by_hash,
|
|
13
|
+
find_terminal_failed_by_filename,
|
|
14
|
+
get_status_summary,
|
|
15
|
+
list_terminal_failed,
|
|
16
|
+
requeue_all_terminal_failed,
|
|
17
|
+
requeue_by_hash,
|
|
18
|
+
)
|
|
19
|
+
from filedge.filesystem import get_filesystem, open_file
|
|
20
|
+
from filedge.config import load_config
|
|
21
|
+
from filedge.health import HealthcheckError
|
|
22
|
+
from filedge.inferrer import infer_schema, infer_schema_from_parquet
|
|
23
|
+
from filedge.inspect_formatter import format_summary, format_yaml
|
|
24
|
+
from filedge.parser import get_parser
|
|
25
|
+
from filedge.preview_formatter import format_preview
|
|
26
|
+
from filedge.progress import RichPipelineProgress
|
|
27
|
+
from filedge.validate_formatter import format_json, format_text
|
|
28
|
+
from filedge.validator import validate_file
|
|
29
|
+
from filedge.pipeline import run_pipeline
|
|
30
|
+
|
|
31
|
+
# Maps a lowercase file extension to the format name used by the CLI when
# --format is not given.
_EXT_TO_FORMAT = {
    ".csv": "csv",
    ".ndjson": "ndjson",
    ".jsonl": "ndjson",
    ".parquet": "parquet",
}

# Accepted values for the --format option of the file-oriented commands.
_FORMAT_CHOICE = click.Choice(["csv", "ndjson", "parquet"])
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@click.group()
def cli():
    # Root command group; subcommands register themselves via @cli.command().
    # Deliberately left without a docstring: click would show it as help text.
    pass
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@cli.command()
@click.option("--dir", "watched_dir", required=True,
              type=click.Path(exists=True, file_okay=False, dir_okay=True),
              help="Watched directory path")
@click.option("--config", "config_path", required=True,
              type=click.Path(exists=True, dir_okay=False),
              help="Path to pipeline.yaml")
@click.option("--audit-db-url", required=True, envvar="FILEDGE_AUDIT_DB_URL", help="Audit database URL")
@click.option(
    "--progress/--no-progress",
    "show_progress",
    default=None,
    help="Show live progress bars. Defaults to on for interactive terminals.",
)
@click.option("--json", "output_json", is_flag=True,
              help="Write the Run summary as a single JSON line to stdout. Exit non-zero if any file failed.")
@click.option("--log-format", "log_format", type=click.Choice(["json", "text"]), default=None,
              help="Log output format. Defaults to text on a TTY, json otherwise.")
@click.option("--log-level", "log_level", default="INFO", show_default=True,
              help="Log level (DEBUG, INFO, WARNING, ERROR).")
@click.option("--otel-traces/--no-otel-traces", "otel_traces", default=None,
              help="Enable OpenTelemetry tracing. Off by default. Also enabled by FILEDGE_OTEL_TRACES=true.")
@click.option("--otel-logs/--no-otel-logs", "otel_logs", default=None,
              help="Export filedge logs through OpenTelemetry. Off by default. Also enabled by FILEDGE_OTEL_LOGS=true.")
def run(
    watched_dir,
    config_path,
    audit_db_url,
    show_progress,
    output_json,
    log_format,
    log_level,
    otel_traces,
    otel_logs,
):
    """Run the ETL pipeline for a Watched Directory."""
    # Imports needed only by this command are deferred to call time.
    from filedge.log import configure_logging, get_logger
    from filedge.progress import LoggingProgressReporter
    from filedge.tracing import (
        configure_otel_logs,
        configure_tracing,
        should_enable_logs,
        should_enable_tracing,
    )

    try:
        # Unset --progress / --log-format default from whether stderr is a TTY.
        is_tty = sys.stderr.isatty()
        if show_progress is None:
            show_progress = is_tty
        if log_format is None:
            log_format = "text" if is_tty else "json"

        configure_logging(level=log_level, fmt=log_format)

        # The CLI flag and the environment variable are both handed to the
        # tracing module, which decides whether each OTel feature is enabled.
        tracing_on = should_enable_tracing(
            cli_flag=otel_traces,
            env_value=os.environ.get("FILEDGE_OTEL_TRACES"),
        )
        configure_tracing(enabled=tracing_on)
        logs_on = should_enable_logs(
            cli_flag=otel_logs,
            env_value=os.environ.get("FILEDGE_OTEL_LOGS"),
        )
        configure_otel_logs(enabled=logs_on)

        run_id = _new_run_id()
        log_reporter = LoggingProgressReporter(get_logger("filedge.pipeline"), run_id=run_id)

        from contextlib import ExitStack
        # ExitStack lets the optional reporters be entered conditionally and
        # still be exited together when the block ends.
        with ExitStack() as stack:
            # Progress events fan out to every handler in this list, in order.
            handlers = [log_reporter.handle]

            tracing_reporter = None
            if tracing_on:
                from filedge.progress import TracingProgressReporter
                tracing_reporter = stack.enter_context(TracingProgressReporter(run_id=run_id))
                handlers.append(tracing_reporter.handle)

            if show_progress:
                from rich.console import Console
                rich_progress = stack.enter_context(RichPipelineProgress(Console(stderr=True)))
                # Inserted at the front so the live display sees each event first.
                handlers.insert(0, rich_progress.handle)

            result = run_pipeline(
                watched_dir, config_path, audit_db_url,
                progress=_tee(*handlers), run_id=run_id,
            )
            if tracing_reporter is not None:
                tracing_reporter.set_run_attributes(result)

        if output_json:
            click.echo(json_lib.dumps(result))
        else:
            click.echo(
                f"Committed: {result['committed']} "
                f"Failed: {result['failed']} "
                f"Skipped: {result['skipped']} "
                f"New: {result['new_files']} "
                f"Reclaimed: {result['reclaimed']} "
                f"Retried: {result['retried']}"
            )

        # sys.exit raises SystemExit, which the `except Exception` clause
        # below does not intercept, so this exit code is preserved.
        if result["failed"] > 0:
            sys.exit(1)
    except SchemaError as e:
        click.echo(f"Schema error: {e}", err=True)
        sys.exit(1)
    except HealthcheckError as e:
        click.echo(str(e), err=True)
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report any other failure on stderr and exit 1.
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
|
|
159
|
+
|
|
160
|
+
def _new_run_id() -> str:
|
|
161
|
+
import uuid
|
|
162
|
+
return str(uuid.uuid4())
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _tee(*handlers):
|
|
166
|
+
def fanout(event):
|
|
167
|
+
for h in handlers:
|
|
168
|
+
h(event)
|
|
169
|
+
return fanout
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
@cli.command()
@click.option(
    "--config",
    "config_path",
    required=True,
    type=click.Path(exists=True, dir_okay=False),
    help="Path to pipeline.yaml",
)
@click.option(
    "--audit-db-url",
    required=True,
    envvar="FILEDGE_AUDIT_DB_URL",
    help="Audit database URL",
)
@click.option(
    "--json",
    "output_json",
    is_flag=True,
    help="Write health status as a JSON object to stdout.",
)
def healthcheck(config_path, audit_db_url, output_json):
    """Probe the Audit DB and destination connector without writing data."""
    from filedge.health import run_healthchecks

    # Any failure while loading the config or running the probes is reported
    # as one error line on stderr followed by exit code 1.
    try:
        pipeline_config = load_config(config_path)
        report = run_healthchecks(pipeline_config, audit_db_url)
    except Exception as exc:
        click.echo(f"Healthcheck failed: configuration unreachable: {exc}", err=True)
        sys.exit(1)

    if output_json:
        click.echo(json_lib.dumps(report))
    else:
        # Text mode: passing probes go to stdout, failing ones to stderr.
        for probe in report["checks"]:
            if not probe["ok"]:
                click.echo(
                    f"{probe['name']}: unreachable: {probe['error']}",
                    err=True,
                )
                continue
            click.echo(f"{probe['name']}: ok ({probe['latency_ms']} ms)")

    if not report["healthy"]:
        sys.exit(1)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
@cli.command()
@click.option(
    "--watched-dir",
    required=True,
    type=click.Path(exists=True, file_okay=False, dir_okay=True),
    help="Source prefix containing small files",
)
@click.option("--output", required=True, help="Output prefix for compacted files")
@click.option(
    "--max-files",
    default=1000,
    show_default=True,
    help="Max input files per output file",
)
@click.option("--compress", is_flag=True, help="Gzip-compress output (.ndjson.gz)")
@click.option(
    "--delete-source",
    is_flag=True,
    help="Delete source files after each batch commits (requires delete permission).",
)
def compact(watched_dir, output, max_files, compress, delete_source):
    """Merge small NDJSON files into fewer larger files before ingestion."""
    # Both the compaction and the report line are guarded: any failure becomes
    # an "Error: ..." line on stderr and exit code 1.
    try:
        outcome = run_compact(
            watched_dir,
            output,
            max_files=max_files,
            compress=compress,
            delete_source=delete_source,
        )
        report_line = (
            f"Batches written: {outcome['batches']} "
            f"Files compacted: {outcome['files_compacted']}"
        )
        click.echo(report_line)
    except Exception as exc:
        click.echo(f"Error: {exc}", err=True)
        sys.exit(1)
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
@cli.command()
@click.option("--audit-db-url", required=True, envvar="FILEDGE_AUDIT_DB_URL", help="Audit database URL")
@click.option("--json", "output_json", is_flag=True, help="Output as JSON")
def status(audit_db_url, output_json):
    """Show pipeline status summary."""
    db = Database(audit_db_url)
    try:
        # Table setup and the summary query both run under the finally so the
        # connection is released even when either of them raises.
        create_audit_tables(db)
        summary = get_status_summary(db)
    finally:
        db.close()

    if output_json:
        click.echo(json_lib.dumps(summary, indent=2))
        return

    # Text mode: one line per state, then any recent failures.
    click.echo(f"PENDING: {summary['PENDING']}")
    click.echo(f"PROCESSING: {summary['PROCESSING']}")
    click.echo(f"COMMITTED: {summary['COMMITTED']}")
    click.echo(f"FAILED: {summary['FAILED']}")
    if summary["recent_failures"]:
        click.echo("\nRecent failures:")
        for f in summary["recent_failures"]:
            click.echo(f" {f['filename']}: {f['error_message']}")
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
@cli.command()
@click.argument("file")
@click.option(
    "--format",
    "fmt",
    default=None,
    type=_FORMAT_CHOICE,
    help="File format (auto-detected from extension)",
)
@click.option(
    "--sample-rows",
    default=1000,
    show_default=True,
    help="Number of rows to sample",
)
@click.option(
    "--output",
    "output_path",
    default=None,
    type=click.Path(dir_okay=False),
    help="Write YAML block to this file instead of stdout",
)
@click.option(
    "--encoding",
    default="utf-8",
    show_default=True,
    help="File encoding (e.g. utf-8, cp500, latin-1)",
)
def inspect(file, fmt, sample_rows, output_path, encoding):
    """Infer schema from a file and output a columns: block for pipeline.yaml."""
    # Without --format, fall back to the (case-insensitive) file extension.
    if fmt is None:
        extension = os.path.splitext(file)[1].lower()
        fmt = _EXT_TO_FORMAT.get(extension)
    if fmt is None:
        click.echo(
            f"Error: cannot detect format for {file!r}. "
            f"Use --format csv or --format ndjson.",
            err=True,
        )
        sys.exit(1)

    try:
        fs, path = get_filesystem(file)
        if fmt != "parquet":
            # Row formats: sample parsed rows and infer column types from them.
            parser = get_parser(fmt)
            with open_file(path, fs=fs, encoding=encoding) as handle:
                columns = infer_schema(parser.parse(handle), sample_rows=sample_rows)
        else:
            # Parquet: the schema is read from the file's Arrow schema instead.
            import pyarrow.parquet as pq
            with open_file(path, fs=fs, mode="rb") as handle:
                arrow_schema = pq.ParquetFile(handle).schema_arrow
                columns = infer_schema_from_parquet(arrow_schema)
    except Exception as exc:
        click.echo(f"Error: {exc}", err=True)
        sys.exit(1)

    yaml_block = format_yaml(columns, source_path=file, sample_rows=sample_rows)
    summary = format_summary(columns)

    # The summary goes to stderr so stdout carries only the YAML block.
    click.echo(summary, err=True)

    if not output_path:
        click.echo(yaml_block, nl=False)
        return
    with open(output_path, "w", encoding="utf-8") as destination:
        destination.write(yaml_block)
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
@cli.command()
@click.argument("file")
@click.option("--format", "fmt", default=None, type=_FORMAT_CHOICE,
              help="File format (auto-detected from extension)")
# Range-checked by click so out-of-range values produce a clear usage error
# (exit code 2) instead of an internal islice() error message.
@click.option("--rows", "num_rows", default=10, show_default=True,
              type=click.IntRange(min=0), help="Number of rows to display")
@click.option("--start-row", "start_row", default=1, show_default=True,
              type=click.IntRange(min=1), help="First row to display (1-indexed)")
@click.option("--encoding", default="utf-8", show_default=True, help="File encoding (e.g. utf-8, cp500, latin-1)")
def preview(file, fmt, num_rows, start_row, encoding):
    """Show N rows of a file as a formatted table, optionally starting at a given row."""
    # Without --format, fall back to the (case-insensitive) file extension.
    if fmt is None:
        _, ext = os.path.splitext(file)
        fmt = _EXT_TO_FORMAT.get(ext.lower())
        if fmt is None:
            click.echo(
                f"Error: cannot detect format for {file!r}. "
                f"Use --format csv or --format ndjson.",
                err=True,
            )
            sys.exit(2)

    try:
        from itertools import islice
        fs, path = get_filesystem(file)
        parser = get_parser(fmt)
        with open_file(path, fs=fs, mode=parser.mode, encoding=encoding) as f:
            # start_row is 1-indexed; islice wants a 0-based half-open window.
            first = start_row - 1
            rows = list(islice(parser.parse(f), first, first + num_rows))
    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(2)

    click.echo(format_preview(rows, start_row=start_row))
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
@cli.command()
@click.argument("file")
@click.option(
    "--config",
    "config_path",
    required=True,
    type=click.Path(exists=True, dir_okay=False),
    help="Path to pipeline.yaml",
)
@click.option(
    "--format",
    "fmt",
    default=None,
    type=_FORMAT_CHOICE,
    help="File format (auto-detected from extension)",
)
@click.option(
    "--sample-rows",
    default=None,
    type=int,
    help="Validate only the first N rows",
)
@click.option("--json", "output_json", is_flag=True, help="Output as JSON to stdout")
@click.option(
    "--encoding",
    default=None,
    help="Override file encoding from pipeline.yaml (e.g. cp500)",
)
def validate(file, config_path, fmt, sample_rows, output_json, encoding):
    """Validate a file against a pipeline.yaml schema without loading it."""
    # Without --format, fall back to the (case-insensitive) file extension.
    if fmt is None:
        extension = os.path.splitext(file)[1].lower()
        fmt = _EXT_TO_FORMAT.get(extension)
    if fmt is None:
        click.echo(
            f"Error: cannot detect format for {file!r}. "
            f"Use --format csv or --format ndjson.",
            err=True,
        )
        sys.exit(2)

    # Exit code 2 marks an operational error; 1 is reserved for validation
    # failures reported at the end.
    try:
        config = load_config(config_path)
        # A truthy --encoding wins over the encoding from the config file.
        chosen_encoding = encoding if encoding else config.encoding
        fs, path = get_filesystem(file)
        parser = get_parser(fmt)
        with open_file(path, fs=fs, mode=parser.mode, encoding=chosen_encoding) as handle:
            records = parser.parse(handle)
            if sample_rows is not None:
                from itertools import islice
                records = islice(records, sample_rows)
            # Validation runs while the file is still open.
            result = validate_file(records, config.columns)
    except Exception as exc:
        click.echo(f"Error: {exc}", err=True)
        sys.exit(2)

    # The text report always goes to stderr; JSON, when asked for, to stdout.
    click.echo(format_text(result), err=True)
    if output_json:
        payload = format_json(result)
        click.echo(json_lib.dumps(payload))

    if result.failures:
        sys.exit(1)
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
@cli.command()
@click.option(
    "--shell",
    type=click.Choice(["zsh", "bash"]),
    default=None,
    help="Shell type (auto-detected from $SHELL if omitted)",
)
def completion(shell):
    """Print shell completion script.

    \b
    Zsh: filedge completion >> ~/.zshrc && source ~/.zshrc
    Bash: filedge completion --shell bash >> ~/.bashrc && source ~/.bashrc
    """
    if shell is None:
        # Look for a known shell name inside $SHELL, trying zsh before bash.
        login_shell = os.environ.get("SHELL", "")
        shell = next(
            (name for name in ("zsh", "bash") if name in login_shell),
            None,
        )
        if shell is None:
            raise click.UsageError(
                "Cannot detect shell from $SHELL. Use --shell zsh or --shell bash."
            )

    from click.shell_completion import BashComplete, ZshComplete

    if shell == "zsh":
        completer_cls = ZshComplete
    else:
        completer_cls = BashComplete
    completer = completer_cls(cli, {}, "filedge", "_FILEDGE_COMPLETE")
    # nl=False: the script is emitted exactly as click generates it.
    click.echo(completer.source(), nl=False)
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
@cli.command()
@click.argument("filename", required=False)
@click.option("--hash", "content_hash", default=None,
              help="Content hash to disambiguate when multiple records share the same filename")
@click.option("--all-terminal-failed", "all_terminal_failed", is_flag=True,
              help="Requeue all terminal-FAILED files")
@click.option("--dry-run", is_flag=True,
              help="List files that would be requeued without making changes (requires --all-terminal-failed)")
@click.option("--yes", is_flag=True,
              help="Confirm bulk requeue (required with --all-terminal-failed)")
@click.option("--retry-cap", default=3, show_default=True,
              help="Retry cap used to identify terminal-FAILED files; must match pipeline.yaml")
@click.option("--audit-db-url", required=True, envvar="FILEDGE_AUDIT_DB_URL",
              help="Audit database URL")
def requeue(filename, content_hash, all_terminal_failed, dry_run, yes, retry_cap, audit_db_url):
    """Requeue terminal-FAILED files so they are retried on the next run.

    \b
    Single file:
    filedge requeue orders.csv
    filedge requeue orders.csv --hash a1b2c3... # disambiguate duplicate filenames

    \b
    Bulk:
    filedge requeue --all-terminal-failed # preview count
    filedge requeue --all-terminal-failed --dry-run # list affected files
    filedge requeue --all-terminal-failed --yes # execute
    """
    # Reject contradictory flag combinations before touching the database.
    if filename and all_terminal_failed:
        click.echo("Error: provide either a filename or --all-terminal-failed, not both.", err=True)
        sys.exit(1)
    if not filename and not all_terminal_failed:
        click.echo("Error: provide a filename or --all-terminal-failed.", err=True)
        sys.exit(1)
    if filename and dry_run:
        click.echo("Error: --dry-run is only valid with --all-terminal-failed.", err=True)
        sys.exit(1)
    if filename and yes:
        click.echo("Error: --yes is only valid with --all-terminal-failed.", err=True)
        sys.exit(1)
    if dry_run and yes:
        click.echo("Error: --dry-run and --yes are mutually exclusive.", err=True)
        sys.exit(1)

    db = Database(audit_db_url)

    try:
        # Table setup runs inside the try so the connection is closed by the
        # finally clause even when it fails.
        create_audit_tables(db)

        if all_terminal_failed:
            records = list_terminal_failed(db, retry_cap)

            if dry_run:
                if not records:
                    click.echo("No terminal-FAILED files found.")
                    return
                for r in records:
                    click.echo(f" {r.filename} {r.content_hash} {r.error_message or ''}")
                click.echo(
                    f"\nWould requeue {len(records)} file(s). Re-run with --yes to proceed."
                )
                return

            if not yes:
                # Without --yes only the count is reported; nothing is changed.
                count = len(records)
                if count == 0:
                    click.echo("No terminal-FAILED files found.")
                    return
                click.echo(
                    f"Found {count} terminal-FAILED file(s). Re-run with --yes to requeue."
                )
                sys.exit(1)

            n = requeue_all_terminal_failed(db, retry_cap)
            db.commit()
            click.echo(f"Requeued: {n}")

        else:
            if content_hash:
                # NOTE(review): the record is looked up by hash alone; its
                # filename is not compared with the FILENAME argument — confirm
                # this is intended.
                record = find_file_by_hash(db, content_hash)
                if record is None:
                    click.echo(f"Error: no record found for hash {content_hash!r}.", err=True)
                    sys.exit(1)
                # Only FAILED records that reached the retry cap are eligible.
                if record.state != "FAILED" or record.attempt_count < retry_cap:
                    click.echo(
                        f"Error: {record.filename!r} is in state {record.state!r} with"
                        f" attempt_count={record.attempt_count} — not eligible for requeue"
                        f" (retry_cap={retry_cap}).",
                        err=True,
                    )
                    sys.exit(1)
                requeue_by_hash(db, content_hash)
                db.commit()
                click.echo(f"Requeued: {record.filename} ({content_hash[:12]}…)")
            else:
                records = find_terminal_failed_by_filename(db, filename, retry_cap)
                if not records:
                    click.echo(
                        f"Error: no terminal-FAILED record found for {filename!r}.", err=True
                    )
                    sys.exit(1)
                if len(records) > 1:
                    # Ambiguous filename: list the candidate hashes and stop.
                    click.echo(
                        f"Error: {len(records)} terminal-FAILED records found for {filename!r}."
                        f" Use --hash to disambiguate:",
                        err=True,
                    )
                    for r in records:
                        click.echo(
                            f" --hash {r.content_hash} (error: {r.error_message or 'unknown'})",
                            err=True,
                        )
                    sys.exit(1)
                record = records[0]
                requeue_by_hash(db, record.content_hash)
                db.commit()
                click.echo(f"Requeued: {record.filename} ({record.content_hash[:12]}…)")
    finally:
        # Runs on every path, including the sys.exit() calls above.
        db.close()
|
filedge/column_types.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
import re
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
# Column type names accepted by validate_column_type().
FILEDGE_TYPES = ("string", "integer", "float", "date", "timestamp", "boolean")
# Label for the ISO calendar-date layout; also quoted in date_like_note().
ISO_DATE_FORMAT = "YYYY-MM-DD"

# Lowercase literal pairs that boolean_set() recognises as boolean vocabularies.
_BOOLEAN_SETS = [
    {"true", "false"},
    {"yes", "no"},
]

# (anchored regex, layout label) pairs tried in order by date_format().
# The patterns check digit grouping only, not that the date is valid.
_DATE_PATTERNS = [
    (re.compile(r"^\d{4}-\d{2}-\d{2}$"), ISO_DATE_FORMAT),
    (re.compile(r"^\d{2}/\d{2}/\d{4}$"), "MM/DD/YYYY"),
    (re.compile(r"^\d{2}-\d{2}-\d{4}$"), "DD-MM-YYYY"),
]

# Matches text that starts with an ISO date, then "T" or a space, then HH:MM.
# There is no end anchor, so anything may follow the minutes.
_DATETIME_RE = re.compile(r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def validate_column_type(column_type: str) -> None:
    """Raise ``ValueError`` unless *column_type* is listed in ``FILEDGE_TYPES``."""
    if column_type in FILEDGE_TYPES:
        return
    raise ValueError(
        f"Unknown Filedge column type {column_type!r}. "
        f"Supported types: {', '.join(FILEDGE_TYPES)}"
    )
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def coerce_value(column_type: str, value: Any) -> Any:
    """Convert *value* to the Python representation of a Filedge column type.

    Args:
        column_type: One of the names in ``FILEDGE_TYPES``.
        value: The raw value to convert.

    Returns:
        ``str``, ``int``, ``float`` or ``bool`` for the scalar types; for
        ``date`` and ``timestamp`` the value re-serialised with
        ``isoformat()``.

    Raises:
        ValueError: If *column_type* is unknown or *value* cannot be
            converted. The underlying conversions may also raise
            ``TypeError`` for unsupported value types.
    """
    validate_column_type(column_type)
    if column_type == "string":
        return str(value)
    if column_type == "integer":
        # NOTE(review): int() truncates float inputs (3.7 -> 3) — confirm that
        # callers never pass non-integral floats for integer columns.
        return int(value)
    if column_type == "float":
        return float(value)
    if column_type == "boolean":
        return coerce_boolean(value)
    if column_type == "date":
        return datetime.date.fromisoformat(str(value)).isoformat()
    if column_type == "timestamp":
        text = str(value)
        try:
            parsed = datetime.datetime.fromisoformat(text)
        except ValueError:
            # Before Python 3.11 fromisoformat() rejects the "Z" UTC
            # designator; retry with the equivalent "+00:00" offset so the
            # result matches newer interpreters.
            if not text.endswith("Z"):
                raise
            parsed = datetime.datetime.fromisoformat(f"{text[:-1]}+00:00")
        return parsed.isoformat()
    # Unreachable while every name in FILEDGE_TYPES has a branch above.
    raise AssertionError(f"Unhandled Filedge column type: {column_type}")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def coerce_boolean(value: Any) -> bool:
    """Interpret *value* as a boolean.

    ``bool`` inputs are returned unchanged. Anything else is stringified,
    stripped and lower-cased, then matched against the accepted literals
    ``true``/``1``/``yes`` and ``false``/``0``/``no``.

    Raises:
        ValueError: If the normalised text is not an accepted literal.
    """
    if isinstance(value, bool):
        return value

    literals = {
        "true": True,
        "1": True,
        "yes": True,
        "false": False,
        "0": False,
        "no": False,
    }
    token = str(value).strip().lower()
    outcome = literals.get(token)
    if outcome is None:
        raise ValueError(f"cannot interpret {value!r} as boolean")
    return outcome
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def boolean_set(values: list[str]):
    """Return the first boolean vocabulary that contains every given value.

    Values are compared case-insensitively. Returns ``None`` when no entry
    of ``_BOOLEAN_SETS`` covers all of them.
    """
    observed = {item.lower() for item in values}
    return next(
        (vocabulary for vocabulary in _BOOLEAN_SETS if observed <= vocabulary),
        None,
    )
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def date_format(value: str):
    """Return the label of the first date pattern matching *value*, else ``None``."""
    matching_labels = (
        label for pattern, label in _DATE_PATTERNS if pattern.match(value)
    )
    return next(matching_labels, None)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def has_time(value: str) -> bool:
    """Tell whether *value* starts with a date followed by an HH:MM time."""
    found = _DATETIME_RE.match(value)
    return found is not None
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def date_like_note(formats: set[str]) -> str:
    """Build a note listing the detected date layouts in sorted order.

    The note also states the layout that a filedge ``date`` column requires.
    """
    ordered = sorted(formats)
    labels = ", ".join(ordered)
    return f"date-like {labels}; filedge date requires {ISO_DATE_FORMAT}"
|