filedge 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
filedge/__init__.py ADDED
File without changes
filedge/cdc.py ADDED
@@ -0,0 +1,49 @@
1
+ from dataclasses import dataclass
2
+ from typing import Iterable
3
+
4
+ from filedge.config import CdcConfig
5
+
6
+
7
class CdcError(Exception):
    """Signal that a batch of CDC rows is unsafe to apply as given."""
9
+
10
+
11
@dataclass(frozen=True)
class CdcChange:
    """Immutable record of the change selected for one key.

    Field notes below describe how ``plan_cdc_changes`` populates the record.
    """

    # Logical operation name: one of the keys of ``CdcConfig.operations``.
    operation: str
    # Key-column values, ordered like ``CdcConfig.keys``.
    key: tuple
    # Shallow copy of the source row that won for this key.
    row: dict
16
+
17
+
18
def plan_cdc_changes(rows: Iterable[dict], cdc: CdcConfig) -> list[CdcChange]:
    """Reduce CDC rows to a single change per key, keeping the highest sequence.

    Args:
        rows: Source rows as dictionaries.
        cdc: CDC settings; this function reads ``operations``,
            ``operation_column``, ``keys`` and ``sequence_by``.

    Returns:
        One ``CdcChange`` per distinct key, in first-seen key order.

    Raises:
        CdcError: If a row carries an unmapped operation value, a null key
            column, a null sequence, or a sequence equal to the one currently
            held for the same key.
    """
    # Invert {operation: [raw values]} into {raw value: operation}.
    operation_lookup = {}
    for operation_name, raw_values in cdc.operations.items():
        for raw_value in raw_values:
            operation_lookup[raw_value] = operation_name

    # key -> (sequence, operation, row copy) for the best row seen so far.
    winners = {}

    for source_row in rows:
        raw_operation = source_row.get(cdc.operation_column)
        resolved = operation_lookup.get(raw_operation)
        if resolved is None:
            raise CdcError(f"Unknown CDC operation: {raw_operation!r}")

        key = tuple(source_row.get(column) for column in cdc.keys)
        if any(part is None for part in key):
            raise CdcError("CDC key columns cannot be null")

        sequence = source_row.get(cdc.sequence_by)
        if sequence is None:
            raise CdcError(f"CDC sequence column {cdc.sequence_by!r} cannot be null")

        current = winners.get(key)
        if current is None:
            winners[key] = (sequence, resolved, dict(source_row))
            continue

        # NOTE(review): ties are only detected against the current winner, so
        # whether a duplicate sequence raises depends on row order.
        held_sequence = current[0]
        if held_sequence == sequence:
            raise CdcError(f"Multiple CDC rows for key {key!r} have the same sequence")
        if sequence > held_sequence:
            winners[key] = (sequence, resolved, dict(source_row))

    changes = []
    for key, (_sequence, operation, row) in winners.items():
        changes.append(CdcChange(operation=operation, key=key, row=row))
    return changes
filedge/cli.py ADDED
@@ -0,0 +1,526 @@
1
+ import json as json_lib
2
+ import os
3
+ import sys
4
+
5
+ import click
6
+
7
+ from filedge.compactor import compact as run_compact
8
+ from filedge.connectors import SchemaError
9
+ from filedge.db import (
10
+ Database,
11
+ create_audit_tables,
12
+ find_file_by_hash,
13
+ find_terminal_failed_by_filename,
14
+ get_status_summary,
15
+ list_terminal_failed,
16
+ requeue_all_terminal_failed,
17
+ requeue_by_hash,
18
+ )
19
+ from filedge.filesystem import get_filesystem, open_file
20
+ from filedge.config import load_config
21
+ from filedge.health import HealthcheckError
22
+ from filedge.inferrer import infer_schema, infer_schema_from_parquet
23
+ from filedge.inspect_formatter import format_summary, format_yaml
24
+ from filedge.parser import get_parser
25
+ from filedge.preview_formatter import format_preview
26
+ from filedge.progress import RichPipelineProgress
27
+ from filedge.validate_formatter import format_json, format_text
28
+ from filedge.validator import validate_file
29
+ from filedge.pipeline import run_pipeline
30
+
31
# Lower-cased file extension -> format name, used when --format is omitted.
_EXT_TO_FORMAT = {
    ".csv": "csv",
    # Both NDJSON spellings map to the same format.
    ".ndjson": "ndjson",
    ".jsonl": "ndjson",
    ".parquet": "parquet",
}

# Accepted --format values: exactly the formats reachable through the map above.
_FORMAT_CHOICE = click.Choice(sorted(set(_EXT_TO_FORMAT.values())))
39
+
40
+
41
@click.group()
def cli():
    # Root command group; subcommands register through ``@cli.command()``.
    # Kept without a docstring on purpose: click would print it as help text.
    return None
44
+
45
+
46
@cli.command()
@click.option(
    "--dir",
    "watched_dir",
    required=True,
    type=click.Path(exists=True, file_okay=False, dir_okay=True),
    help="Watched directory path",
)
@click.option(
    "--config",
    "config_path",
    required=True,
    type=click.Path(exists=True, dir_okay=False),
    help="Path to pipeline.yaml",
)
@click.option(
    "--audit-db-url",
    required=True,
    envvar="FILEDGE_AUDIT_DB_URL",
    help="Audit database URL",
)
@click.option(
    "--progress/--no-progress",
    "show_progress",
    default=None,
    help="Show live progress bars. Defaults to on for interactive terminals.",
)
@click.option(
    "--json",
    "output_json",
    is_flag=True,
    help="Write the Run summary as a single JSON line to stdout. Exit non-zero if any file failed.",
)
@click.option(
    "--log-format",
    "log_format",
    type=click.Choice(["json", "text"]),
    default=None,
    help="Log output format. Defaults to text on a TTY, json otherwise.",
)
@click.option(
    "--log-level",
    "log_level",
    default="INFO",
    show_default=True,
    help="Log level (DEBUG, INFO, WARNING, ERROR).",
)
@click.option(
    "--otel-traces/--no-otel-traces",
    "otel_traces",
    default=None,
    help="Enable OpenTelemetry tracing. Off by default. Also enabled by FILEDGE_OTEL_TRACES=true.",
)
@click.option(
    "--otel-logs/--no-otel-logs",
    "otel_logs",
    default=None,
    help="Export filedge logs through OpenTelemetry. Off by default. Also enabled by FILEDGE_OTEL_LOGS=true.",
)
def run(
    watched_dir,
    config_path,
    audit_db_url,
    show_progress,
    output_json,
    log_format,
    log_level,
    otel_traces,
    otel_logs,
):
    """Run the ETL pipeline for a Watched Directory."""
    # The docstring above is the click help text; implementation notes live
    # in comments. Exit status is 1 when any file failed or any error occurred.
    from contextlib import ExitStack

    from filedge.log import configure_logging, get_logger
    from filedge.progress import LoggingProgressReporter
    from filedge.tracing import (
        configure_otel_logs,
        configure_tracing,
        should_enable_logs,
        should_enable_tracing,
    )

    try:
        # Tri-state flags left unset fall back to what suits the terminal.
        stderr_is_tty = sys.stderr.isatty()
        if show_progress is None:
            show_progress = stderr_is_tty
        if log_format is None:
            log_format = "text" if stderr_is_tty else "json"

        configure_logging(level=log_level, fmt=log_format)

        tracing_on = should_enable_tracing(
            cli_flag=otel_traces,
            env_value=os.environ.get("FILEDGE_OTEL_TRACES"),
        )
        configure_tracing(enabled=tracing_on)

        logs_on = should_enable_logs(
            cli_flag=otel_logs,
            env_value=os.environ.get("FILEDGE_OTEL_LOGS"),
        )
        configure_otel_logs(enabled=logs_on)

        run_id = _new_run_id()
        pipeline_logger = get_logger("filedge.pipeline")
        log_reporter = LoggingProgressReporter(pipeline_logger, run_id=run_id)

        with ExitStack() as stack:
            # Event sinks in delivery order; the rich display, when enabled,
            # is placed ahead of the logging and tracing reporters.
            sinks = [log_reporter.handle]
            tracing_reporter = None

            if tracing_on:
                from filedge.progress import TracingProgressReporter

                tracing_reporter = stack.enter_context(
                    TracingProgressReporter(run_id=run_id)
                )
                sinks.append(tracing_reporter.handle)

            if show_progress:
                from rich.console import Console

                console = Console(stderr=True)
                rich_progress = stack.enter_context(RichPipelineProgress(console))
                sinks.insert(0, rich_progress.handle)

            result = run_pipeline(
                watched_dir,
                config_path,
                audit_db_url,
                progress=_tee(*sinks),
                run_id=run_id,
            )
            if tracing_reporter is not None:
                tracing_reporter.set_run_attributes(result)

        if output_json:
            click.echo(json_lib.dumps(result))
        else:
            summary_fields = (
                ("Committed", "committed"),
                ("Failed", "failed"),
                ("Skipped", "skipped"),
                ("New", "new_files"),
                ("Reclaimed", "reclaimed"),
                ("Retried", "retried"),
            )
            click.echo(
                " ".join(f"{label}: {result[field]}" for label, field in summary_fields)
            )

        if result["failed"] > 0:
            sys.exit(1)
    except SchemaError as exc:
        click.echo(f"Schema error: {exc}", err=True)
        sys.exit(1)
    except HealthcheckError as exc:
        click.echo(str(exc), err=True)
        sys.exit(1)
    except Exception as exc:
        click.echo(f"Error: {exc}", err=True)
        sys.exit(1)
159
+
160
def _new_run_id() -> str:
    """Return a freshly generated random UUID (version 4) as a string."""
    from uuid import uuid4

    return f"{uuid4()}"
163
+
164
+
165
def _tee(*handlers):
    """Build a callable that passes one event to every handler, in order.

    An exception raised by a handler propagates and stops delivery to the
    handlers after it.
    """

    def fanout(event):
        for deliver in handlers:
            deliver(event)

    return fanout
170
+
171
+
172
@cli.command()
@click.option(
    "--config",
    "config_path",
    required=True,
    type=click.Path(exists=True, dir_okay=False),
    help="Path to pipeline.yaml",
)
@click.option(
    "--audit-db-url",
    required=True,
    envvar="FILEDGE_AUDIT_DB_URL",
    help="Audit database URL",
)
@click.option(
    "--json",
    "output_json",
    is_flag=True,
    help="Write health status as a JSON object to stdout.",
)
def healthcheck(config_path, audit_db_url, output_json):
    """Probe the Audit DB and destination connector without writing data."""
    # Exit status 1 when the probes cannot be run or the report is unhealthy.
    from filedge.health import run_healthchecks

    try:
        pipeline_config = load_config(config_path)
        report = run_healthchecks(pipeline_config, audit_db_url)
    except Exception as exc:
        click.echo(f"Healthcheck failed: configuration unreachable: {exc}", err=True)
        sys.exit(1)

    if output_json:
        click.echo(json_lib.dumps(report))
    else:
        for check in report["checks"]:
            if not check["ok"]:
                # Failing probes go to stderr so stdout only lists healthy ones.
                click.echo(
                    f"{check['name']}: unreachable: {check['error']}",
                    err=True,
                )
                continue
            click.echo(f"{check['name']}: ok ({check['latency_ms']} ms)")

    if not report["healthy"]:
        sys.exit(1)
203
+
204
+
205
@cli.command()
@click.option(
    "--watched-dir",
    required=True,
    type=click.Path(exists=True, file_okay=False, dir_okay=True),
    help="Source prefix containing small files",
)
@click.option("--output", required=True, help="Output prefix for compacted files")
@click.option(
    "--max-files",
    default=1000,
    show_default=True,
    help="Max input files per output file",
)
@click.option("--compress", is_flag=True, help="Gzip-compress output (.ndjson.gz)")
@click.option(
    "--delete-source",
    is_flag=True,
    help="Delete source files after each batch commits (requires delete permission).",
)
def compact(watched_dir, output, max_files, compress, delete_source):
    """Merge small NDJSON files into fewer larger files before ingestion."""
    # Any failure is reported on stderr and turned into exit status 1.
    try:
        result = run_compact(
            watched_dir,
            output,
            max_files=max_files,
            compress=compress,
            delete_source=delete_source,
        )
        batches = result["batches"]
        files_compacted = result["files_compacted"]
        click.echo(f"Batches written: {batches} Files compacted: {files_compacted}")
    except Exception as exc:
        click.echo(f"Error: {exc}", err=True)
        sys.exit(1)
226
+
227
+
228
@cli.command()
@click.option("--audit-db-url", required=True, envvar="FILEDGE_AUDIT_DB_URL", help="Audit database URL")
@click.option("--json", "output_json", is_flag=True, help="Output as JSON")
def status(audit_db_url, output_json):
    """Show pipeline status summary."""
    # Prints per-state counts and any recent failures, either as indented
    # JSON or as plain text lines.
    db = Database(audit_db_url)
    try:
        create_audit_tables(db)
        summary = get_status_summary(db)
    finally:
        # Close the connection even when table creation or the query raises;
        # previously an exception here skipped db.close() entirely.
        db.close()

    if output_json:
        click.echo(json_lib.dumps(summary, indent=2))
    else:
        click.echo(f"PENDING: {summary['PENDING']}")
        click.echo(f"PROCESSING: {summary['PROCESSING']}")
        click.echo(f"COMMITTED: {summary['COMMITTED']}")
        click.echo(f"FAILED: {summary['FAILED']}")
        if summary["recent_failures"]:
            click.echo("\nRecent failures:")
            for f in summary["recent_failures"]:
                click.echo(f" {f['filename']}: {f['error_message']}")
249
+
250
+
251
@cli.command()
@click.argument("file")
@click.option("--format", "fmt", default=None, type=_FORMAT_CHOICE,
              help="File format (auto-detected from extension)")
@click.option("--sample-rows", default=1000, show_default=True, help="Number of rows to sample")
@click.option("--output", "output_path", default=None,
              type=click.Path(dir_okay=False),
              help="Write YAML block to this file instead of stdout")
@click.option("--encoding", default="utf-8", show_default=True, help="File encoding (e.g. utf-8, cp500, latin-1)")
def inspect(file, fmt, sample_rows, output_path, encoding):
    """Infer schema from a file and output a columns: block for pipeline.yaml."""
    # The human-readable summary goes to stderr; the YAML block goes to
    # stdout or to --output. Exit status is 1 on any failure.
    if fmt is None:
        _, ext = os.path.splitext(file)
        fmt = _EXT_TO_FORMAT.get(ext.lower())
        if fmt is None:
            # The hint lists every value accepted by --format; the previous
            # text left out parquet even though it is supported.
            click.echo(
                f"Error: cannot detect format for {file!r}. "
                f"Use --format csv, --format ndjson or --format parquet.",
                err=True,
            )
            sys.exit(1)

    try:
        fs, path = get_filesystem(file)
        if fmt == "parquet":
            # Parquet carries its own schema, so no rows are sampled.
            import pyarrow.parquet as pq
            with open_file(path, fs=fs, mode="rb") as f:
                columns = infer_schema_from_parquet(pq.ParquetFile(f).schema_arrow)
        else:
            parser = get_parser(fmt)
            with open_file(path, fs=fs, encoding=encoding) as f:
                columns = infer_schema(parser.parse(f), sample_rows=sample_rows)
    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)

    yaml_block = format_yaml(columns, source_path=file, sample_rows=sample_rows)
    summary = format_summary(columns)

    click.echo(summary, err=True)

    if output_path:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(yaml_block)
    else:
        click.echo(yaml_block, nl=False)
297
+
298
+
299
@cli.command()
@click.argument("file")
@click.option("--format", "fmt", default=None, type=_FORMAT_CHOICE,
              help="File format (auto-detected from extension)")
@click.option("--rows", "num_rows", default=10, show_default=True,
              type=click.IntRange(min=0),
              help="Number of rows to display")
@click.option("--start-row", "start_row", default=1, show_default=True,
              type=click.IntRange(min=1),
              help="First row to display (1-indexed)")
@click.option("--encoding", default="utf-8", show_default=True, help="File encoding (e.g. utf-8, cp500, latin-1)")
def preview(file, fmt, num_rows, start_row, encoding):
    """Show N rows of a file as a formatted table, optionally starting at a given row."""
    # --rows and --start-row are range-checked by click, so out-of-range
    # values give a usage error (exit status 2) instead of an opaque
    # itertools.islice error. Other failures also exit with status 2.
    if fmt is None:
        _, ext = os.path.splitext(file)
        fmt = _EXT_TO_FORMAT.get(ext.lower())
        if fmt is None:
            # The hint lists every value accepted by --format; the previous
            # text left out parquet even though it is supported.
            click.echo(
                f"Error: cannot detect format for {file!r}. "
                f"Use --format csv, --format ndjson or --format parquet.",
                err=True,
            )
            sys.exit(2)

    try:
        from itertools import islice
        fs, path = get_filesystem(file)
        parser = get_parser(fmt)
        with open_file(path, fs=fs, mode=parser.mode, encoding=encoding) as f:
            # start_row is 1-indexed; islice wants a 0-based half-open window.
            first = start_row - 1
            rows = list(islice(parser.parse(f), first, first + num_rows))
    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(2)

    click.echo(format_preview(rows, start_row=start_row))
330
+
331
+
332
@cli.command()
@click.argument("file")
@click.option("--config", "config_path", required=True,
              type=click.Path(exists=True, dir_okay=False),
              help="Path to pipeline.yaml")
@click.option("--format", "fmt", default=None, type=_FORMAT_CHOICE,
              help="File format (auto-detected from extension)")
@click.option("--sample-rows", default=None, type=int, help="Validate only the first N rows")
@click.option("--json", "output_json", is_flag=True, help="Output as JSON to stdout")
@click.option("--encoding", default=None, help="Override file encoding from pipeline.yaml (e.g. cp500)")
def validate(file, config_path, fmt, sample_rows, output_json, encoding):
    """Validate a file against a pipeline.yaml schema without loading it."""
    # Exit status: 2 when the file or config cannot be processed, 1 when
    # validation reports failures. The text report always goes to stderr;
    # --json additionally writes a JSON document to stdout.
    if fmt is None:
        _, ext = os.path.splitext(file)
        fmt = _EXT_TO_FORMAT.get(ext.lower())
        if fmt is None:
            # The hint lists every value accepted by --format; the previous
            # text left out parquet even though it is supported.
            click.echo(
                f"Error: cannot detect format for {file!r}. "
                f"Use --format csv, --format ndjson or --format parquet.",
                err=True,
            )
            sys.exit(2)

    try:
        config = load_config(config_path)
        # An explicit --encoding wins over the value from pipeline.yaml.
        effective_encoding = encoding or config.encoding
        fs, path = get_filesystem(file)
        parser = get_parser(fmt)
        with open_file(path, fs=fs, mode=parser.mode, encoding=effective_encoding) as f:
            rows = parser.parse(f)
            if sample_rows is not None:
                from itertools import islice
                rows = islice(rows, sample_rows)
            # Validate while the file is still open: rows is consumed lazily.
            result = validate_file(rows, config.columns)
    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        sys.exit(2)

    click.echo(format_text(result), err=True)
    if output_json:
        click.echo(json_lib.dumps(format_json(result)))

    if result.failures:
        sys.exit(1)
376
+
377
+
378
@cli.command()
@click.option(
    "--shell",
    type=click.Choice(["zsh", "bash"]),
    default=None,
    help="Shell type (auto-detected from $SHELL if omitted)",
)
def completion(shell):
    """Print shell completion script.

    \b
    Zsh: filedge completion >> ~/.zshrc && source ~/.zshrc
    Bash: filedge completion --shell bash >> ~/.bashrc && source ~/.bashrc
    """
    if shell is None:
        # Substring match against $SHELL; zsh is probed before bash.
        login_shell = os.environ.get("SHELL", "")
        shell = next(
            (candidate for candidate in ("zsh", "bash") if candidate in login_shell),
            None,
        )
        if shell is None:
            raise click.UsageError(
                "Cannot detect shell from $SHELL. Use --shell zsh or --shell bash."
            )

    from click.shell_completion import BashComplete, ZshComplete

    completer_cls = {"zsh": ZshComplete}.get(shell, BashComplete)
    completer = completer_cls(cli, {}, "filedge", "_FILEDGE_COMPLETE")
    click.echo(completer.source(), nl=False)
407
+
408
+
409
@cli.command()
@click.argument("filename", required=False)
@click.option("--hash", "content_hash", default=None,
              help="Content hash to disambiguate when multiple records share the same filename")
@click.option("--all-terminal-failed", "all_terminal_failed", is_flag=True,
              help="Requeue all terminal-FAILED files")
@click.option("--dry-run", is_flag=True,
              help="List files that would be requeued without making changes (requires --all-terminal-failed)")
@click.option("--yes", is_flag=True,
              help="Confirm bulk requeue (required with --all-terminal-failed)")
@click.option("--retry-cap", default=3, show_default=True,
              help="Retry cap used to identify terminal-FAILED files; must match pipeline.yaml")
@click.option("--audit-db-url", required=True, envvar="FILEDGE_AUDIT_DB_URL",
              help="Audit database URL")
def requeue(filename, content_hash, all_terminal_failed, dry_run, yes, retry_cap, audit_db_url):
    """Requeue terminal-FAILED files so they are retried on the next run.

    \b
    Single file:
      filedge requeue orders.csv
      filedge requeue orders.csv --hash a1b2c3... # disambiguate duplicate filenames

    \b
    Bulk:
      filedge requeue --all-terminal-failed # preview count
      filedge requeue --all-terminal-failed --dry-run # list affected files
      filedge requeue --all-terminal-failed --yes # execute
    """
    # Reject contradictory flag combinations before touching the database.
    if filename and all_terminal_failed:
        click.echo("Error: provide either a filename or --all-terminal-failed, not both.", err=True)
        sys.exit(1)
    if not filename and not all_terminal_failed:
        click.echo("Error: provide a filename or --all-terminal-failed.", err=True)
        sys.exit(1)
    if filename and dry_run:
        click.echo("Error: --dry-run is only valid with --all-terminal-failed.", err=True)
        sys.exit(1)
    if filename and yes:
        click.echo("Error: --yes is only valid with --all-terminal-failed.", err=True)
        sys.exit(1)
    if dry_run and yes:
        click.echo("Error: --dry-run and --yes are mutually exclusive.", err=True)
        sys.exit(1)

    db = Database(audit_db_url)

    try:
        # Runs inside the try so the connection is closed even if table
        # creation fails; previously that failure leaked the connection.
        create_audit_tables(db)

        if all_terminal_failed:
            records = list_terminal_failed(db, retry_cap)

            if dry_run:
                if not records:
                    click.echo("No terminal-FAILED files found.")
                    return
                for r in records:
                    click.echo(f" {r.filename} {r.content_hash} {r.error_message or ''}")
                click.echo(
                    f"\nWould requeue {len(records)} file(s). Re-run with --yes to proceed."
                )
                return

            if not yes:
                # Without --yes this is a preview only; a non-zero exit
                # signals that nothing was requeued.
                count = len(records)
                if count == 0:
                    click.echo("No terminal-FAILED files found.")
                    return
                click.echo(
                    f"Found {count} terminal-FAILED file(s). Re-run with --yes to requeue."
                )
                sys.exit(1)

            n = requeue_all_terminal_failed(db, retry_cap)
            db.commit()
            click.echo(f"Requeued: {n}")

        else:
            if content_hash:
                record = find_file_by_hash(db, content_hash)
                if record is None:
                    click.echo(f"Error: no record found for hash {content_hash!r}.", err=True)
                    sys.exit(1)
                # Only FAILED records that have reached the retry cap qualify.
                if record.state != "FAILED" or record.attempt_count < retry_cap:
                    click.echo(
                        f"Error: {record.filename!r} is in state {record.state!r} with"
                        f" attempt_count={record.attempt_count} — not eligible for requeue"
                        f" (retry_cap={retry_cap}).",
                        err=True,
                    )
                    sys.exit(1)
                requeue_by_hash(db, content_hash)
                db.commit()
                click.echo(f"Requeued: {record.filename} ({content_hash[:12]}…)")
            else:
                records = find_terminal_failed_by_filename(db, filename, retry_cap)
                if not records:
                    click.echo(
                        f"Error: no terminal-FAILED record found for {filename!r}.", err=True
                    )
                    sys.exit(1)
                if len(records) > 1:
                    # Ambiguous filename: list the hashes so the user can pick.
                    click.echo(
                        f"Error: {len(records)} terminal-FAILED records found for {filename!r}."
                        f" Use --hash to disambiguate:",
                        err=True,
                    )
                    for r in records:
                        click.echo(
                            f" --hash {r.content_hash} (error: {r.error_message or 'unknown'})",
                            err=True,
                        )
                    sys.exit(1)
                record = records[0]
                requeue_by_hash(db, record.content_hash)
                db.commit()
                click.echo(f"Requeued: {record.filename} ({record.content_hash[:12]}…)")
    finally:
        db.close()
@@ -0,0 +1,79 @@
1
+ import datetime
2
+ import re
3
+ from typing import Any
4
+
5
# Column type names accepted by validate_column_type / coerce_value.
FILEDGE_TYPES = (
    "string",
    "integer",
    "float",
    "date",
    "timestamp",
    "boolean",
)

# Label of the only date layout named in date_like_note as acceptable.
ISO_DATE_FORMAT = "YYYY-MM-DD"

# Lower-case vocabularies checked, in order, by boolean_set.
_BOOLEAN_SETS = [{"true", "false"}, {"yes", "no"}]

# (pattern, label) pairs tried in order by date_format.
_DATE_PATTERNS = [
    (re.compile(r"^\d{4}-\d{2}-\d{2}$"), ISO_DATE_FORMAT),
    (re.compile(r"^\d{2}/\d{2}/\d{4}$"), "MM/DD/YYYY"),
    (re.compile(r"^\d{2}-\d{2}-\d{4}$"), "DD-MM-YYYY"),
]

# Prefix match: an ISO-style date, then "T" or a space, then HH:MM.
_DATETIME_RE = re.compile(r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}")
20
+
21
+
22
def validate_column_type(column_type: str) -> None:
    """Raise ``ValueError`` unless *column_type* is listed in ``FILEDGE_TYPES``."""
    if column_type in FILEDGE_TYPES:
        return
    supported = ", ".join(FILEDGE_TYPES)
    raise ValueError(
        f"Unknown Filedge column type {column_type!r}. Supported types: {supported}"
    )
28
+
29
+
30
def coerce_value(column_type: str, value: Any) -> Any:
    """Convert *value* to the Python representation of a Filedge column type.

    Args:
        column_type: One of ``FILEDGE_TYPES``.
        value: The raw value to convert.

    Returns:
        ``str``, ``int``, ``float`` or ``bool`` for the matching types; for
        ``date`` and ``timestamp`` an ISO 8601 string produced by
        ``isoformat()``.

    Raises:
        ValueError: If *column_type* is unknown or *value* cannot be parsed
            as that type.
        TypeError: If ``int()`` or ``float()`` rejects the type of *value*.
    """
    validate_column_type(column_type)
    if column_type == "string":
        return str(value)
    if column_type == "integer":
        return int(value)
    if column_type == "float":
        return float(value)
    if column_type == "boolean":
        return coerce_boolean(value)
    if column_type == "date":
        return datetime.date.fromisoformat(str(value)).isoformat()
    if column_type == "timestamp":
        text = str(value)
        # datetime.fromisoformat() only understands a trailing "Z" (UTC) from
        # Python 3.11 onwards. Spelling it as an explicit offset gives the
        # same result there and makes older interpreters accept it too.
        if text.endswith("Z"):
            text = f"{text[:-1]}+00:00"
        return datetime.datetime.fromisoformat(text).isoformat()
    # Unreachable while this chain covers every entry of FILEDGE_TYPES.
    raise AssertionError(f"Unhandled Filedge column type: {column_type}")
45
+
46
+
47
def coerce_boolean(value: Any) -> bool:
    """Interpret *value* as a boolean.

    ``bool`` inputs are returned unchanged. Anything else is converted with
    ``str()``, stripped and lower-cased, then matched against
    ``true``/``1``/``yes`` and ``false``/``0``/``no``.

    Raises:
        ValueError: If the normalised text is not one of the accepted tokens.
    """
    if isinstance(value, bool):
        return value

    truth_by_token = {
        "true": True,
        "1": True,
        "yes": True,
        "false": False,
        "0": False,
        "no": False,
    }
    token = str(value).strip().lower()
    outcome = truth_by_token.get(token)
    if outcome is None:
        raise ValueError(f"cannot interpret {value!r} as boolean")
    return outcome
56
+
57
+
58
def boolean_set(values: list[str]):
    """Return the first entry of ``_BOOLEAN_SETS`` that covers every value.

    Values are compared lower-cased. Returns ``None`` when no vocabulary
    contains them all. An empty *values* is a subset of anything, so it
    yields the first vocabulary.
    """
    observed = {value.lower() for value in values}
    candidates = (
        vocabulary for vocabulary in _BOOLEAN_SETS if observed.issubset(vocabulary)
    )
    return next(candidates, None)
64
+
65
+
66
def date_format(value: str):
    """Return the label of the first ``_DATE_PATTERNS`` entry matching *value*.

    Returns ``None`` when no pattern matches.
    """
    matching_labels = (
        label for pattern, label in _DATE_PATTERNS if pattern.match(value)
    )
    return next(matching_labels, None)
71
+
72
+
73
def has_time(value: str) -> bool:
    """Tell whether *value* starts with a date followed by an HH:MM time."""
    return _DATETIME_RE.match(value) is not None
75
+
76
+
77
def date_like_note(formats: set[str]) -> str:
    """Build a note listing the given date format labels in sorted order."""
    listed = ", ".join(sorted(formats))
    return "date-like " + listed + "; filedge date requires " + ISO_DATE_FORMAT