expunct-cli 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21) hide show
  1. {expunct_cli-0.1.0 → expunct_cli-0.2.0}/PKG-INFO +77 -3
  2. {expunct_cli-0.1.0 → expunct_cli-0.2.0}/README.md +73 -0
  3. {expunct_cli-0.1.0 → expunct_cli-0.2.0}/pyproject.toml +4 -3
  4. expunct_cli-0.2.0/src/expunct_cli/__init__.py +1 -0
  5. expunct_cli-0.2.0/src/expunct_cli/commands/extract.py +216 -0
  6. expunct_cli-0.2.0/src/expunct_cli/commands/parse.py +115 -0
  7. expunct_cli-0.2.0/src/expunct_cli/commands/safe_parse.py +118 -0
  8. {expunct_cli-0.1.0 → expunct_cli-0.2.0}/src/expunct_cli/main.py +18 -0
  9. expunct_cli-0.1.0/src/expunct_cli/__init__.py +0 -1
  10. {expunct_cli-0.1.0 → expunct_cli-0.2.0}/.github/workflows/ci.yml +0 -0
  11. {expunct_cli-0.1.0 → expunct_cli-0.2.0}/.github/workflows/publish.yml +0 -0
  12. {expunct_cli-0.1.0 → expunct_cli-0.2.0}/.gitignore +0 -0
  13. {expunct_cli-0.1.0 → expunct_cli-0.2.0}/src/expunct_cli/client.py +0 -0
  14. {expunct_cli-0.1.0 → expunct_cli-0.2.0}/src/expunct_cli/commands/__init__.py +0 -0
  15. {expunct_cli-0.1.0 → expunct_cli-0.2.0}/src/expunct_cli/commands/audit.py +0 -0
  16. {expunct_cli-0.1.0 → expunct_cli-0.2.0}/src/expunct_cli/commands/config_cmd.py +0 -0
  17. {expunct_cli-0.1.0 → expunct_cli-0.2.0}/src/expunct_cli/commands/detect.py +0 -0
  18. {expunct_cli-0.1.0 → expunct_cli-0.2.0}/src/expunct_cli/commands/jobs.py +0 -0
  19. {expunct_cli-0.1.0 → expunct_cli-0.2.0}/src/expunct_cli/commands/policies.py +0 -0
  20. {expunct_cli-0.1.0 → expunct_cli-0.2.0}/src/expunct_cli/commands/redact.py +0 -0
  21. {expunct_cli-0.1.0 → expunct_cli-0.2.0}/src/expunct_cli/output.py +0 -0
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: expunct-cli
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: CLI for the Expunct PII redaction API — redact, detect, and manage sensitive data from the command line.
5
5
  Project-URL: Homepage, https://expunct.ai
6
6
  Project-URL: Documentation, https://docs.expunct.ai
7
7
  Project-URL: Repository, https://github.com/expunct/cli
8
8
  Author: Expunct
9
9
  License-Expression: MIT
10
- Keywords: cli,pii,privacy,redaction,security
10
+ Keywords: ai,cli,gdpr,hipaa,llm,pdf,pii,privacy,rag,redaction,security
11
11
  Classifier: Development Status :: 4 - Beta
12
12
  Classifier: Environment :: Console
13
13
  Classifier: Intended Audience :: Developers
@@ -18,10 +18,11 @@ Classifier: Programming Language :: Python :: 3.10
18
18
  Classifier: Programming Language :: Python :: 3.11
19
19
  Classifier: Programming Language :: Python :: 3.12
20
20
  Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
22
  Classifier: Topic :: Security
22
23
  Classifier: Topic :: Software Development :: Libraries
23
24
  Requires-Python: >=3.10
24
- Requires-Dist: expunct>=0.1.1
25
+ Requires-Dist: expunct>=0.2.0
25
26
  Requires-Dist: rich>=13.0
26
27
  Requires-Dist: typer>=0.15
27
28
  Provides-Extra: dev
@@ -212,6 +213,79 @@ expunct audit list --event-type redaction --page-size 50
212
213
  expunct audit list --json
213
214
  ```
214
215
 
216
+ ### Document Intelligence commands (beta)
217
+
218
+ `parse`, `extract`, and `safe-parse` call the Expunct Document Intelligence API,
219
+ which is currently in **beta and feature-flag gated**. Your tenant must be
220
+ enabled before these commands will succeed; calls from disabled tenants return a
221
+ `403`. PDF and DOCX are the supported formats during beta.
222
+
223
+ ### `expunct parse`
224
+
225
+ Parse a PDF or DOCX into a canonical document structure (text, tables, headings, layout).
226
+
227
+ ```bash
228
+ expunct parse report.pdf
229
+ expunct parse contract.docx --language en
230
+ expunct parse report.pdf --no-wait # submit and return job ID immediately
231
+ expunct parse report.pdf --json # raw JSON output
232
+ ```
233
+
234
+ | Flag | Description |
235
+ |------|-------------|
236
+ | `--language, -l` | Language code (default: `en`) |
237
+ | `--wait/--no-wait` | Wait for job to complete (default: `--wait`) |
238
+ | `--timeout` | Wait timeout in seconds (default: `300`) |
239
+ | `--json` | Raw JSON output |
240
+
241
+ ### `expunct extract`
242
+
243
+ Extract structured fields from a parsed document using a JSON Schema or built-in template.
244
+
245
+ ```bash
246
+ # From an existing parse artifact (preferred — avoids re-parsing)
247
+ expunct extract art-abc123 --schema invoice_schema.json
248
+
249
+ # Parse and extract in one step
250
+ expunct extract --file report.pdf --template invoice
251
+
252
+ # Inline schema
253
+ expunct extract art-abc123 --schema-json '{"type":"object","properties":{"total":{"type":"number"}}}'
254
+ ```
255
+
256
+ | Flag | Description |
257
+ |------|-------------|
258
+ | `PARSE_ARTIFACT_ID` | Canonical document artifact ID from a completed parse job |
259
+ | `--file, -f` | PDF or DOCX to parse and extract in one step (mutually exclusive with artifact ID) |
260
+ | `--schema, -s` | Path to JSON Schema file defining fields to extract |
261
+ | `--schema-json` | Inline JSON Schema string |
262
+ | `--template, -t` | Built-in template ID (e.g. `invoice`) |
263
+ | `--language, -l` | Language code (default: `en`) |
264
+ | `--wait/--no-wait` | Wait for job to complete (default: `--wait`) |
265
+ | `--timeout` | Wait timeout in seconds (default: `300`) |
266
+ | `--json` | Raw JSON output |
267
+
268
+ ### `expunct safe-parse`
269
+
270
+ Parse a PDF or DOCX and sanitize PII in a single workflow. Produces sanitized
271
+ canonical document, sanitized markdown, and sanitized chunk artifacts suitable
272
+ for AI ingestion (RAG, prompts, embeddings).
273
+
274
+ ```bash
275
+ expunct safe-parse contract.pdf
276
+ expunct safe-parse report.docx --policy-id strict
277
+ expunct safe-parse contract.pdf --no-wait # submit and return job ID immediately
278
+ expunct safe-parse contract.pdf --json # raw JSON output
279
+ ```
280
+
281
+ | Flag | Description |
282
+ |------|-------------|
283
+ | `--policy-id, -p` | Redaction policy ID applied during sanitization |
284
+ | `--language, -l` | Language code (default: `en`) |
285
+ | `--wait/--no-wait` | Wait for job to complete (default: `--wait`) |
286
+ | `--timeout` | Wait timeout in seconds (default: `300`) |
287
+ | `--json` | Raw JSON output |
288
+
215
289
  ### `expunct config`
216
290
 
217
291
  Manage CLI configuration.
@@ -181,6 +181,79 @@ expunct audit list --event-type redaction --page-size 50
181
181
  expunct audit list --json
182
182
  ```
183
183
 
184
+ ### Document Intelligence commands (beta)
185
+
186
+ `parse`, `extract`, and `safe-parse` call the Expunct Document Intelligence API,
187
+ which is currently in **beta and feature-flag gated**. Your tenant must be
188
+ enabled before these commands will succeed; calls from disabled tenants return a
189
+ `403`. PDF and DOCX are the supported formats during beta.
190
+
191
+ ### `expunct parse`
192
+
193
+ Parse a PDF or DOCX into a canonical document structure (text, tables, headings, layout).
194
+
195
+ ```bash
196
+ expunct parse report.pdf
197
+ expunct parse contract.docx --language en
198
+ expunct parse report.pdf --no-wait # submit and return job ID immediately
199
+ expunct parse report.pdf --json # raw JSON output
200
+ ```
201
+
202
+ | Flag | Description |
203
+ |------|-------------|
204
+ | `--language, -l` | Language code (default: `en`) |
205
+ | `--wait/--no-wait` | Wait for job to complete (default: `--wait`) |
206
+ | `--timeout` | Wait timeout in seconds (default: `300`) |
207
+ | `--json` | Raw JSON output |
208
+
209
+ ### `expunct extract`
210
+
211
+ Extract structured fields from a parsed document using a JSON Schema or built-in template.
212
+
213
+ ```bash
214
+ # From an existing parse artifact (preferred — avoids re-parsing)
215
+ expunct extract art-abc123 --schema invoice_schema.json
216
+
217
+ # Parse and extract in one step
218
+ expunct extract --file report.pdf --template invoice
219
+
220
+ # Inline schema
221
+ expunct extract art-abc123 --schema-json '{"type":"object","properties":{"total":{"type":"number"}}}'
222
+ ```
223
+
224
+ | Flag | Description |
225
+ |------|-------------|
226
+ | `PARSE_ARTIFACT_ID` | Canonical document artifact ID from a completed parse job |
227
+ | `--file, -f` | PDF or DOCX to parse and extract in one step (mutually exclusive with artifact ID) |
228
+ | `--schema, -s` | Path to JSON Schema file defining fields to extract |
229
+ | `--schema-json` | Inline JSON Schema string |
230
+ | `--template, -t` | Built-in template ID (e.g. `invoice`) |
231
+ | `--language, -l` | Language code (default: `en`) |
232
+ | `--wait/--no-wait` | Wait for job to complete (default: `--wait`) |
233
+ | `--timeout` | Wait timeout in seconds (default: `300`) |
234
+ | `--json` | Raw JSON output |
235
+
236
+ ### `expunct safe-parse`
237
+
238
+ Parse a PDF or DOCX and sanitize PII in a single workflow. Produces sanitized
239
+ canonical document, sanitized markdown, and sanitized chunk artifacts suitable
240
+ for AI ingestion (RAG, prompts, embeddings).
241
+
242
+ ```bash
243
+ expunct safe-parse contract.pdf
244
+ expunct safe-parse report.docx --policy-id strict
245
+ expunct safe-parse contract.pdf --no-wait # submit and return job ID immediately
246
+ expunct safe-parse contract.pdf --json # raw JSON output
247
+ ```
248
+
249
+ | Flag | Description |
250
+ |------|-------------|
251
+ | `--policy-id, -p` | Redaction policy ID applied during sanitization |
252
+ | `--language, -l` | Language code (default: `en`) |
253
+ | `--wait/--no-wait` | Wait for job to complete (default: `--wait`) |
254
+ | `--timeout` | Wait timeout in seconds (default: `300`) |
255
+ | `--json` | Raw JSON output |
256
+
184
257
  ### `expunct config`
185
258
 
186
259
  Manage CLI configuration.
@@ -4,13 +4,13 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "expunct-cli"
7
- version = "0.1.0"
7
+ version = "0.2.0"
8
8
  description = "CLI for the Expunct PII redaction API — redact, detect, and manage sensitive data from the command line."
9
9
  readme = "README.md"
10
10
  license = "MIT"
11
11
  requires-python = ">=3.10"
12
12
  authors = [{ name = "Expunct" }]
13
- keywords = ["pii", "redaction", "privacy", "cli", "security"]
13
+ keywords = ["pii", "redaction", "privacy", "cli", "security", "gdpr", "hipaa", "llm", "rag", "ai", "pdf"]
14
14
  classifiers = [
15
15
  "Development Status :: 4 - Beta",
16
16
  "Environment :: Console",
@@ -23,10 +23,11 @@ classifiers = [
23
23
  "Programming Language :: Python :: 3.12",
24
24
  "Programming Language :: Python :: 3.13",
25
25
  "Topic :: Security",
26
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
26
27
  "Topic :: Software Development :: Libraries",
27
28
  ]
28
29
  dependencies = [
29
- "expunct>=0.1.1",
30
+ "expunct>=0.2.0",
30
31
  "typer>=0.15",
31
32
  "rich>=13.0",
32
33
  ]
@@ -0,0 +1 @@
1
+ __version__ = "0.2.0"
@@ -0,0 +1,216 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ import typer
8
+ from expunct import (
9
+ ApiError,
10
+ AuthenticationError,
11
+ NotFoundError,
12
+ PollingTimeoutError,
13
+ RateLimitError,
14
+ ValidationError,
15
+ )
16
+
17
+ from expunct_cli.client import get_client
18
+ from expunct_cli.output import console, print_error, print_json
19
+
20
+ app = typer.Typer(no_args_is_help=False, invoke_without_command=True)
21
+
22
+
@app.callback(invoke_without_command=True)
def extract(
    ctx: typer.Context,
    parse_artifact_id: str | None = typer.Argument(
        None,
        help="Canonical document artifact ID from a completed parse job. "
        "Mutually exclusive with --file.",
    ),
    file: Path | None = typer.Option(
        None, "--file", "-f",
        help="PDF or DOCX file to parse and extract in one step. "
        "Mutually exclusive with PARSE_ARTIFACT_ID.",
    ),
    schema: Path | None = typer.Option(
        None, "--schema", "-s",
        help="Path to a JSON Schema file defining the fields to extract.",
    ),
    schema_json: str | None = typer.Option(
        None, "--schema-json",
        help="Inline JSON Schema string (alternative to --schema).",
    ),
    template_id: str | None = typer.Option(
        None, "--template", "-t",
        help="Built-in extraction template ID (e.g. 'invoice'). "
        "Mutually exclusive with --schema / --schema-json.",
    ),
    language: str = typer.Option("en", "--language", "-l", help="Language code."),
    wait: bool = typer.Option(True, "--wait/--no-wait", help="Wait for job to complete."),
    timeout: int = typer.Option(300, "--timeout", help="Timeout in seconds when waiting."),
    json_output: bool = typer.Option(False, "--json", help="Output raw JSON."),
) -> None:
    """Extract structured fields from a parsed document.

    Provide exactly one source: a PARSE_ARTIFACT_ID (canonical document artifact
    from a completed parse job) or --file (parses and extracts in one step).

    Provide exactly one schema: --schema / --schema-json or --template.

    Examples:

        expunct extract art-abc123 --schema invoice_schema.json

        expunct extract --file report.pdf --template invoice

        expunct extract art-abc123 \\
            --schema-json '{"type":"object","properties":{"total":{"type":"number"}}}'
    """
    # NOTE: the docstring above is rendered by Typer as the command's --help
    # text, so maintainer-facing notes live in comments instead.
    #
    # Flow: validate arguments -> resolve the schema -> submit the job ->
    # (optionally) poll until it finishes -> print the extract_result payload.
    # Every failure path prints a message via print_error and exits with 1.
    if ctx.invoked_subcommand is not None:
        return

    # Validate source: exactly one of PARSE_ARTIFACT_ID / --file.
    if parse_artifact_id is None and file is None:
        print_error("Provide either a PARSE_ARTIFACT_ID argument or --file.")
        raise typer.Exit(1)
    if parse_artifact_id is not None and file is not None:
        print_error("PARSE_ARTIFACT_ID and --file are mutually exclusive.")
        raise typer.Exit(1)

    # Validate schema selection: exactly one of --schema / --schema-json / --template.
    if schema is None and schema_json is None and template_id is None:
        print_error("Provide --schema, --schema-json, or --template.")
        raise typer.Exit(1)
    if template_id is not None and (schema is not None or schema_json is not None):
        print_error("--template and --schema/--schema-json are mutually exclusive.")
        raise typer.Exit(1)
    if schema is not None and schema_json is not None:
        print_error("--schema and --schema-json are mutually exclusive.")
        raise typer.Exit(1)

    resolved_schema = _load_schema(schema, schema_json)

    # Check the local file up front so a bad path is reported before any
    # client is created. is_file() (rather than exists()) also rejects
    # directories, which would otherwise make open() raise an uncaught error.
    if file is not None:
        if not file.is_file():
            print_error(f"File not found: {file}")
            raise typer.Exit(1)
        suffix = file.suffix.lower()
        if suffix not in {".pdf", ".docx"}:
            print_error(f"Unsupported file type: {suffix}. Accepted: .pdf, .docx")
            raise typer.Exit(1)

    try:
        client = get_client()

        if file is not None:
            with open(file, "rb") as f_obj:
                job = client.documents.extract(
                    file=f_obj,
                    schema=resolved_schema,
                    template_id=template_id,
                    language=language,
                )
        else:
            job = client.documents.extract(
                parse_artifact_id=parse_artifact_id,
                schema=resolved_schema,
                template_id=template_id,
                language=language,
            )

        if not wait:
            if json_output:
                print_json(job)
            else:
                console.print(f"[bold cyan]Job submitted:[/] [dim]{job.id}[/]")
                console.print(f"Status: {job.status}")
            return

        detail = _wait_for_job(client, job.id, timeout)

        if detail.status in {"failed", "error"}:
            print_error(f"Job {job.id} failed: {detail.error_message or 'unknown error'}")
            raise typer.Exit(1)

        # Find the extract_result artifact
        result_artifact = next(
            (a for a in detail.artifacts if a.artifact_kind == "extract_result"), None
        )
        if result_artifact is None:
            print_error("No extract_result artifact found in completed job.")
            raise typer.Exit(1)

        # Fetch artifact content (JSON payload, not metadata)
        content = client.documents.get_artifact_content(result_artifact.id)

        if json_output:
            json.dump(content, sys.stdout, indent=2, default=str)
            sys.stdout.write("\n")
        else:
            console.print(f"[bold green]Extraction complete[/] — job [dim]{detail.id}[/]")
            _print_fields(content)

    except (
        ApiError, AuthenticationError, NotFoundError,
        RateLimitError, ValidationError, PollingTimeoutError,
    ) as e:
        print_error(str(e))
        raise typer.Exit(1)


def _load_schema(schema: Path | None, schema_json: str | None) -> dict | None:
    """Return the JSON Schema given via --schema or --schema-json, if any.

    Returns None when neither option was supplied (i.e. a template is used).
    Prints an error and raises typer.Exit(1) when the file cannot be read,
    the text is not valid JSON, or the JSON is not an object.
    """
    if schema is not None:
        # Read directly and handle the failure instead of checking exists()
        # first: that avoids a race and also covers directories, permission
        # errors and undecodable bytes.
        try:
            raw = schema.read_text(encoding="utf-8")
        except FileNotFoundError:
            print_error(f"Schema file not found: {schema}")
            raise typer.Exit(1)
        except (OSError, UnicodeDecodeError) as e:
            print_error(f"Cannot read schema file {schema}: {e}")
            raise typer.Exit(1)
        invalid_prefix = "Invalid JSON in schema file"
    elif schema_json is not None:
        raw = schema_json
        invalid_prefix = "Invalid --schema-json"
    else:
        return None

    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError as e:
        print_error(f"{invalid_prefix}: {e}")
        raise typer.Exit(1)

    if not isinstance(parsed, dict):
        print_error(
            f"{invalid_prefix}: expected a JSON object, got {type(parsed).__name__}"
        )
        raise typer.Exit(1)
    return parsed


def _wait_for_job(client, job_id: str, timeout: int):
    """Poll the document job until it reaches a terminal status.

    Returns the last job detail object (status "completed", "failed" or
    "error"). Prints an error and raises typer.Exit(1) once `timeout`
    seconds have elapsed without reaching a terminal status.
    """
    import time

    poll_interval = 2.0
    deadline = time.monotonic() + timeout
    detail = client.documents.get_job(job_id)

    while detail.status not in {"completed", "failed", "error"}:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            print_error(f"Timed out waiting for job {job_id}")
            raise typer.Exit(1)
        # Never sleep past the deadline, so --timeout is honoured even when
        # it is not a multiple of the poll interval.
        time.sleep(min(poll_interval, remaining))
        detail = client.documents.get_job(job_id)

    return detail
187
+
188
+
def _print_fields(content: dict) -> None:
    """Pretty-print extracted fields from an extract_result artifact.

    Reads two keys from `content`:
      * "fields": a list of dicts, each read for "name", "value" and
        "confidence"; rendered as a table.
      * "validation_errors": a list of items printed one per line.

    Validation errors are printed even when no fields were extracted.
    Text taken from the payload is escaped before printing so that square
    brackets in document content are shown literally rather than being
    interpreted as Rich markup.
    """
    from rich.markup import escape
    from rich.table import Table

    fields = content.get("fields") or []
    if fields:
        table = Table(title="Extracted Fields")
        table.add_column("Field", style="cyan")
        table.add_column("Value", style="bold")
        table.add_column("Confidence", justify="right", style="yellow")

        for field in fields:
            name = field.get("name", "")
            value = field.get("value", "")
            # An explicit null value is shown as a dimmed placeholder; this is
            # the only cell that intentionally carries markup.
            value_cell = "[dim]null[/]" if value is None else escape(str(value))
            conf_cell = escape(_format_confidence(field.get("confidence")))
            table.add_row(escape(str(name)), value_cell, conf_cell)

        console.print(table)
    else:
        console.print("[dim]No fields extracted.[/]")

    errors = content.get("validation_errors") or []
    if errors:
        console.print(f"\n[yellow]Validation errors:[/] {len(errors)}")
        for err in errors:
            console.print(f" [red]•[/] {escape(str(err))}")


def _format_confidence(confidence: object) -> str:
    """Format a confidence value for display.

    Real numbers (int or float, but not bool) are shown with two decimals,
    so 0 renders as "0.00" instead of an empty cell. Anything else keeps the
    previous behaviour: falsy values become "", other values use str().
    """
    is_number = isinstance(confidence, (int, float)) and not isinstance(confidence, bool)
    if is_number:
        return f"{confidence:.2f}"
    return str(confidence or "")
@@ -0,0 +1,115 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ import typer
6
+ from expunct import (
7
+ ApiError,
8
+ AuthenticationError,
9
+ NotFoundError,
10
+ PollingTimeoutError,
11
+ RateLimitError,
12
+ ValidationError,
13
+ )
14
+
15
+ from expunct_cli.client import get_client
16
+ from expunct_cli.output import console, print_error, print_json
17
+
18
+ app = typer.Typer(no_args_is_help=False, invoke_without_command=True)
19
+
20
+
@app.callback(invoke_without_command=True)
def parse(
    ctx: typer.Context,
    file: Path = typer.Argument(..., help="PDF or DOCX file to parse."),
    language: str = typer.Option("en", "--language", "-l", help="Language code."),
    wait: bool = typer.Option(True, "--wait/--no-wait", help="Wait for job to complete."),
    timeout: int = typer.Option(300, "--timeout", help="Timeout in seconds when waiting."),
    json_output: bool = typer.Option(False, "--json", help="Output raw JSON."),
) -> None:
    """Parse a PDF or DOCX into a canonical document structure.

    Submits the file to the Document Intelligence API and (by default) waits
    for the job to complete, then prints a summary of the produced artifacts.

    Use --no-wait to return the job ID immediately for manual polling.
    """
    # NOTE: the docstring above is rendered by Typer as the command's --help
    # text, so maintainer-facing notes live in comments instead.
    # Every failure path prints a message via print_error and exits with 1.
    if ctx.invoked_subcommand is not None:
        return

    # is_file() (rather than exists()) also rejects directories, which would
    # otherwise make open() below raise an uncaught error.
    if not file.is_file():
        print_error(f"File not found: {file}")
        raise typer.Exit(1)

    suffix = file.suffix.lower()
    if suffix not in {".pdf", ".docx"}:
        print_error(f"Unsupported file type: {suffix}. Accepted: .pdf, .docx")
        raise typer.Exit(1)

    try:
        client = get_client()

        with open(file, "rb") as f:
            job = client.documents.parse(f, language=language)

        # Single --no-wait branch: report the submitted job and stop.
        if not wait:
            if json_output:
                print_json(job)
            else:
                console.print(f"[bold cyan]Job submitted:[/] [dim]{job.id}[/]")
                console.print(f"Status: {job.status}")
                console.print(
                    f"Poll with: [bold]expunct jobs get {job.id}[/] "
                    f"(or use expunct_get_document_job in MCP)"
                )
            return

        detail = _wait_for_job(client, job.id, timeout)

        if detail.status in {"failed", "error"}:
            print_error(f"Job {job.id} failed: {detail.error_message or 'unknown error'}")
            raise typer.Exit(1)

        if json_output:
            print_json(detail)
        else:
            console.print(f"[bold green]Parse complete[/] — job [dim]{detail.id}[/]")
            console.print(f" Pages: {_pages(detail.artifacts)}")
            console.print(f" Artifacts produced: {len(detail.artifacts)}")
            for artifact in detail.artifacts:
                console.print(
                    f" [cyan]{artifact.artifact_kind}[/] id=[dim]{artifact.id}[/]"
                )
            console.print(
                "\nRetrieve artifact content with: "
                "[bold]expunct_get_artifact[/] (MCP) or the Expunct SDK."
            )

    except (
        ApiError, AuthenticationError, NotFoundError,
        RateLimitError, ValidationError, PollingTimeoutError,
    ) as e:
        print_error(str(e))
        raise typer.Exit(1)


def _wait_for_job(client, job_id: str, timeout: int):
    """Poll the document job until it reaches a terminal status.

    Returns the last job detail object (status "completed", "failed" or
    "error"). Prints an error and raises typer.Exit(1) once `timeout`
    seconds have elapsed without reaching a terminal status.
    """
    import time

    poll_interval = 2.0
    deadline = time.monotonic() + timeout
    detail = client.documents.get_job(job_id)

    while detail.status not in {"completed", "failed", "error"}:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            print_error(f"Timed out waiting for job {job_id}")
            raise typer.Exit(1)
        # Never sleep past the deadline, so --timeout is honoured even when
        # it is not a multiple of the poll interval.
        time.sleep(min(poll_interval, remaining))
        detail = client.documents.get_job(job_id)

    return detail
109
+
110
+
111
+ def _pages(artifacts: list) -> str:
112
+ for a in artifacts:
113
+ if a.artifact_kind == "canonical_document" and a.page_count is not None:
114
+ return str(a.page_count)
115
+ return "unknown"
@@ -0,0 +1,118 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ import typer
6
+ from expunct import (
7
+ ApiError,
8
+ AuthenticationError,
9
+ NotFoundError,
10
+ PollingTimeoutError,
11
+ RateLimitError,
12
+ ValidationError,
13
+ )
14
+
15
+ from expunct_cli.client import get_client
16
+ from expunct_cli.output import console, print_error, print_json
17
+
18
+ app = typer.Typer(no_args_is_help=False, invoke_without_command=True)
19
+
20
+
@app.callback(invoke_without_command=True)
def safe_parse(
    ctx: typer.Context,
    file: Path = typer.Argument(..., help="PDF or DOCX file to safe-parse."),
    policy_id: str | None = typer.Option(
        None, "--policy-id", "-p",
        help="Redaction policy ID to apply during sanitization.",
    ),
    language: str = typer.Option("en", "--language", "-l", help="Language code."),
    wait: bool = typer.Option(True, "--wait/--no-wait", help="Wait for job to complete."),
    timeout: int = typer.Option(300, "--timeout", help="Timeout in seconds when waiting."),
    json_output: bool = typer.Option(False, "--json", help="Output raw JSON."),
) -> None:
    """Parse a PDF or DOCX and sanitize PII in one workflow (beta).

    Produces sanitized canonical document, sanitized markdown, and sanitized
    chunk artifacts suitable for downstream AI ingestion (RAG, prompts, etc.).

    Document Intelligence is currently in beta and gated by feature flag —
    your tenant must be enabled before this command will succeed.
    """
    # NOTE: the docstring above is rendered by Typer as the command's --help
    # text, so maintainer-facing notes live in comments instead.
    # Every failure path prints a message via print_error and exits with 1.
    if ctx.invoked_subcommand is not None:
        return

    # is_file() (rather than exists()) also rejects directories, which would
    # otherwise make open() below raise an uncaught error.
    if not file.is_file():
        print_error(f"File not found: {file}")
        raise typer.Exit(1)

    suffix = file.suffix.lower()
    if suffix not in {".pdf", ".docx"}:
        print_error(f"Unsupported file type: {suffix}. Accepted: .pdf, .docx")
        raise typer.Exit(1)

    try:
        client = get_client()

        with open(file, "rb") as f:
            job = client.documents.safe_parse(f, policy_id=policy_id, language=language)

        if not wait:
            if json_output:
                print_json(job)
            else:
                console.print(f"[bold cyan]Job submitted:[/] [dim]{job.id}[/]")
                console.print(f"Status: {job.status}")
                console.print(
                    f"Poll with: [bold]expunct jobs get {job.id}[/] "
                    f"(or use expunct_get_document_job in MCP)"
                )
            return

        detail = _wait_for_job(client, job.id, timeout)

        if detail.status in {"failed", "error"}:
            print_error(f"Job {job.id} failed: {detail.error_message or 'unknown error'}")
            raise typer.Exit(1)

        if json_output:
            print_json(detail)
        else:
            console.print(f"[bold green]Safe-parse complete[/] — job [dim]{detail.id}[/]")
            console.print(f" Pages: {_pages(detail.artifacts)}")
            console.print(f" Artifacts produced: {len(detail.artifacts)}")
            for artifact in detail.artifacts:
                console.print(
                    f" [cyan]{artifact.artifact_kind}[/] id=[dim]{artifact.id}[/]"
                )
            console.print(
                "\nRetrieve sanitized artifact content with: "
                "[bold]expunct_get_artifact[/] (MCP) or the Expunct SDK."
            )

    except (
        ApiError, AuthenticationError, NotFoundError,
        RateLimitError, ValidationError, PollingTimeoutError,
    ) as e:
        print_error(str(e))
        raise typer.Exit(1)


def _wait_for_job(client, job_id: str, timeout: int):
    """Poll the document job until it reaches a terminal status.

    Returns the last job detail object (status "completed", "failed" or
    "error"). Prints an error and raises typer.Exit(1) once `timeout`
    seconds have elapsed without reaching a terminal status.
    """
    import time

    poll_interval = 2.0
    deadline = time.monotonic() + timeout
    detail = client.documents.get_job(job_id)

    while detail.status not in {"completed", "failed", "error"}:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            print_error(f"Timed out waiting for job {job_id}")
            raise typer.Exit(1)
        # Never sleep past the deadline, so --timeout is honoured even when
        # it is not a multiple of the poll interval.
        time.sleep(min(poll_interval, remaining))
        detail = client.documents.get_job(job_id)

    return detail
109
+
110
+
111
+ def _pages(artifacts: list) -> str:
112
+ for a in artifacts:
113
+ if a.artifact_kind == "sanitized_canonical_document" and a.page_count is not None:
114
+ return str(a.page_count)
115
+ for a in artifacts:
116
+ if a.artifact_kind == "canonical_document" and a.page_count is not None:
117
+ return str(a.page_count)
118
+ return "unknown"
@@ -6,9 +6,12 @@ from expunct_cli import __version__
6
6
  from expunct_cli.commands.audit import app as audit_app
7
7
  from expunct_cli.commands.config_cmd import app as config_app
8
8
  from expunct_cli.commands.detect import app as detect_app
9
+ from expunct_cli.commands.extract import app as extract_app
9
10
  from expunct_cli.commands.jobs import app as jobs_app
11
+ from expunct_cli.commands.parse import app as parse_app
10
12
  from expunct_cli.commands.policies import app as policies_app
11
13
  from expunct_cli.commands.redact import app as redact_app
14
+ from expunct_cli.commands.safe_parse import app as safe_parse_app
12
15
 
13
16
  app = typer.Typer(
14
17
  name="expunct",
@@ -41,6 +44,21 @@ app.add_typer(jobs_app, name="jobs", help="Manage redaction jobs.")
41
44
  app.add_typer(policies_app, name="policies", help="Manage redaction policies.")
42
45
  app.add_typer(audit_app, name="audit", help="View audit logs.")
43
46
  app.add_typer(config_app, name="config", help="Manage CLI configuration.")
47
+ app.add_typer(
48
+ parse_app,
49
+ name="parse",
50
+ help="Parse a PDF or DOCX into structured document artifacts.",
51
+ )
52
+ app.add_typer(
53
+ extract_app,
54
+ name="extract",
55
+ help="Extract structured fields from a parsed document.",
56
+ )
57
+ app.add_typer(
58
+ safe_parse_app,
59
+ name="safe-parse",
60
+ help="Parse a PDF or DOCX and sanitize PII in one workflow (beta).",
61
+ )
44
62
 
45
63
  if __name__ == "__main__":
46
64
  app()
@@ -1 +0,0 @@
1
- __version__ = "0.1.0"
File without changes