expunct-cli 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {expunct_cli-0.1.0 → expunct_cli-0.2.0}/PKG-INFO +77 -3
- {expunct_cli-0.1.0 → expunct_cli-0.2.0}/README.md +73 -0
- {expunct_cli-0.1.0 → expunct_cli-0.2.0}/pyproject.toml +4 -3
- expunct_cli-0.2.0/src/expunct_cli/__init__.py +1 -0
- expunct_cli-0.2.0/src/expunct_cli/commands/extract.py +216 -0
- expunct_cli-0.2.0/src/expunct_cli/commands/parse.py +115 -0
- expunct_cli-0.2.0/src/expunct_cli/commands/safe_parse.py +118 -0
- {expunct_cli-0.1.0 → expunct_cli-0.2.0}/src/expunct_cli/main.py +18 -0
- expunct_cli-0.1.0/src/expunct_cli/__init__.py +0 -1
- {expunct_cli-0.1.0 → expunct_cli-0.2.0}/.github/workflows/ci.yml +0 -0
- {expunct_cli-0.1.0 → expunct_cli-0.2.0}/.github/workflows/publish.yml +0 -0
- {expunct_cli-0.1.0 → expunct_cli-0.2.0}/.gitignore +0 -0
- {expunct_cli-0.1.0 → expunct_cli-0.2.0}/src/expunct_cli/client.py +0 -0
- {expunct_cli-0.1.0 → expunct_cli-0.2.0}/src/expunct_cli/commands/__init__.py +0 -0
- {expunct_cli-0.1.0 → expunct_cli-0.2.0}/src/expunct_cli/commands/audit.py +0 -0
- {expunct_cli-0.1.0 → expunct_cli-0.2.0}/src/expunct_cli/commands/config_cmd.py +0 -0
- {expunct_cli-0.1.0 → expunct_cli-0.2.0}/src/expunct_cli/commands/detect.py +0 -0
- {expunct_cli-0.1.0 → expunct_cli-0.2.0}/src/expunct_cli/commands/jobs.py +0 -0
- {expunct_cli-0.1.0 → expunct_cli-0.2.0}/src/expunct_cli/commands/policies.py +0 -0
- {expunct_cli-0.1.0 → expunct_cli-0.2.0}/src/expunct_cli/commands/redact.py +0 -0
- {expunct_cli-0.1.0 → expunct_cli-0.2.0}/src/expunct_cli/output.py +0 -0
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: expunct-cli
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: CLI for the Expunct PII redaction API — redact, detect, and manage sensitive data from the command line.
|
|
5
5
|
Project-URL: Homepage, https://expunct.ai
|
|
6
6
|
Project-URL: Documentation, https://docs.expunct.ai
|
|
7
7
|
Project-URL: Repository, https://github.com/expunct/cli
|
|
8
8
|
Author: Expunct
|
|
9
9
|
License-Expression: MIT
|
|
10
|
-
Keywords: cli,pii,privacy,redaction,security
|
|
10
|
+
Keywords: ai,cli,gdpr,hipaa,llm,pdf,pii,privacy,rag,redaction,security
|
|
11
11
|
Classifier: Development Status :: 4 - Beta
|
|
12
12
|
Classifier: Environment :: Console
|
|
13
13
|
Classifier: Intended Audience :: Developers
|
|
@@ -18,10 +18,11 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
18
18
|
Classifier: Programming Language :: Python :: 3.11
|
|
19
19
|
Classifier: Programming Language :: Python :: 3.12
|
|
20
20
|
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
22
|
Classifier: Topic :: Security
|
|
22
23
|
Classifier: Topic :: Software Development :: Libraries
|
|
23
24
|
Requires-Python: >=3.10
|
|
24
|
-
Requires-Dist: expunct>=0.
|
|
25
|
+
Requires-Dist: expunct>=0.2.0
|
|
25
26
|
Requires-Dist: rich>=13.0
|
|
26
27
|
Requires-Dist: typer>=0.15
|
|
27
28
|
Provides-Extra: dev
|
|
@@ -212,6 +213,79 @@ expunct audit list --event-type redaction --page-size 50
|
|
|
212
213
|
expunct audit list --json
|
|
213
214
|
```
|
|
214
215
|
|
|
216
|
+
### Document Intelligence commands (beta)
|
|
217
|
+
|
|
218
|
+
`parse`, `extract`, and `safe-parse` call the Expunct Document Intelligence API,
|
|
219
|
+
which is currently in **beta and feature-flag gated**. Your tenant must be
|
|
220
|
+
enabled before these commands will succeed; calls from disabled tenants return a
|
|
221
|
+
`403`. PDF and DOCX are the supported formats during beta.
|
|
222
|
+
|
|
223
|
+
### `expunct parse`
|
|
224
|
+
|
|
225
|
+
Parse a PDF or DOCX into a canonical document structure (text, tables, headings, layout).
|
|
226
|
+
|
|
227
|
+
```bash
|
|
228
|
+
expunct parse report.pdf
|
|
229
|
+
expunct parse contract.docx --language en
|
|
230
|
+
expunct parse report.pdf --no-wait # submit and return job ID immediately
|
|
231
|
+
expunct parse report.pdf --json # raw JSON output
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
| Flag | Description |
|
|
235
|
+
|------|-------------|
|
|
236
|
+
| `--language, -l` | Language code (default: `en`) |
|
|
237
|
+
| `--wait/--no-wait` | Wait for job to complete (default: `--wait`) |
|
|
238
|
+
| `--timeout` | Wait timeout in seconds (default: `300`) |
|
|
239
|
+
| `--json` | Raw JSON output |
|
|
240
|
+
|
|
241
|
+
### `expunct extract`
|
|
242
|
+
|
|
243
|
+
Extract structured fields from a parsed document using a JSON Schema or built-in template.
|
|
244
|
+
|
|
245
|
+
```bash
|
|
246
|
+
# From an existing parse artifact (preferred — avoids re-parsing)
|
|
247
|
+
expunct extract art-abc123 --schema invoice_schema.json
|
|
248
|
+
|
|
249
|
+
# Parse and extract in one step
|
|
250
|
+
expunct extract --file report.pdf --template invoice
|
|
251
|
+
|
|
252
|
+
# Inline schema
|
|
253
|
+
expunct extract art-abc123 --schema-json '{"type":"object","properties":{"total":{"type":"number"}}}'
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
| Flag | Description |
|
|
257
|
+
|------|-------------|
|
|
258
|
+
| `PARSE_ARTIFACT_ID` | Canonical document artifact ID from a completed parse job |
|
|
259
|
+
| `--file, -f` | PDF or DOCX to parse and extract in one step (mutually exclusive with artifact ID) |
|
|
260
|
+
| `--schema, -s` | Path to JSON Schema file defining fields to extract |
|
|
261
|
+
| `--schema-json` | Inline JSON Schema string |
|
|
262
|
+
| `--template, -t` | Built-in template ID (e.g. `invoice`) |
|
|
263
|
+
| `--language, -l` | Language code (default: `en`) |
|
|
264
|
+
| `--wait/--no-wait` | Wait for job to complete (default: `--wait`) |
|
|
265
|
+
| `--timeout` | Wait timeout in seconds (default: `300`) |
|
|
266
|
+
| `--json` | Raw JSON output |
|
|
267
|
+
|
|
268
|
+
### `expunct safe-parse`
|
|
269
|
+
|
|
270
|
+
Parse a PDF or DOCX and sanitize PII in a single workflow. Produces sanitized
|
|
271
|
+
canonical document, sanitized markdown, and sanitized chunk artifacts suitable
|
|
272
|
+
for AI ingestion (RAG, prompts, embeddings).
|
|
273
|
+
|
|
274
|
+
```bash
|
|
275
|
+
expunct safe-parse contract.pdf
|
|
276
|
+
expunct safe-parse report.docx --policy-id strict
|
|
277
|
+
expunct safe-parse contract.pdf --no-wait # submit and return job ID immediately
|
|
278
|
+
expunct safe-parse contract.pdf --json # raw JSON output
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
| Flag | Description |
|
|
282
|
+
|------|-------------|
|
|
283
|
+
| `--policy-id, -p` | Redaction policy ID applied during sanitization |
|
|
284
|
+
| `--language, -l` | Language code (default: `en`) |
|
|
285
|
+
| `--wait/--no-wait` | Wait for job to complete (default: `--wait`) |
|
|
286
|
+
| `--timeout` | Wait timeout in seconds (default: `300`) |
|
|
287
|
+
| `--json` | Raw JSON output |
|
|
288
|
+
|
|
215
289
|
### `expunct config`
|
|
216
290
|
|
|
217
291
|
Manage CLI configuration.
|
|
@@ -181,6 +181,79 @@ expunct audit list --event-type redaction --page-size 50
|
|
|
181
181
|
expunct audit list --json
|
|
182
182
|
```
|
|
183
183
|
|
|
184
|
+
### Document Intelligence commands (beta)
|
|
185
|
+
|
|
186
|
+
`parse`, `extract`, and `safe-parse` call the Expunct Document Intelligence API,
|
|
187
|
+
which is currently in **beta and feature-flag gated**. Your tenant must be
|
|
188
|
+
enabled before these commands will succeed; calls from disabled tenants return a
|
|
189
|
+
`403`. PDF and DOCX are the supported formats during beta.
|
|
190
|
+
|
|
191
|
+
### `expunct parse`
|
|
192
|
+
|
|
193
|
+
Parse a PDF or DOCX into a canonical document structure (text, tables, headings, layout).
|
|
194
|
+
|
|
195
|
+
```bash
|
|
196
|
+
expunct parse report.pdf
|
|
197
|
+
expunct parse contract.docx --language en
|
|
198
|
+
expunct parse report.pdf --no-wait # submit and return job ID immediately
|
|
199
|
+
expunct parse report.pdf --json # raw JSON output
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
| Flag | Description |
|
|
203
|
+
|------|-------------|
|
|
204
|
+
| `--language, -l` | Language code (default: `en`) |
|
|
205
|
+
| `--wait/--no-wait` | Wait for job to complete (default: `--wait`) |
|
|
206
|
+
| `--timeout` | Wait timeout in seconds (default: `300`) |
|
|
207
|
+
| `--json` | Raw JSON output |
|
|
208
|
+
|
|
209
|
+
### `expunct extract`
|
|
210
|
+
|
|
211
|
+
Extract structured fields from a parsed document using a JSON Schema or built-in template.
|
|
212
|
+
|
|
213
|
+
```bash
|
|
214
|
+
# From an existing parse artifact (preferred — avoids re-parsing)
|
|
215
|
+
expunct extract art-abc123 --schema invoice_schema.json
|
|
216
|
+
|
|
217
|
+
# Parse and extract in one step
|
|
218
|
+
expunct extract --file report.pdf --template invoice
|
|
219
|
+
|
|
220
|
+
# Inline schema
|
|
221
|
+
expunct extract art-abc123 --schema-json '{"type":"object","properties":{"total":{"type":"number"}}}'
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
| Flag | Description |
|
|
225
|
+
|------|-------------|
|
|
226
|
+
| `PARSE_ARTIFACT_ID` | Canonical document artifact ID from a completed parse job |
|
|
227
|
+
| `--file, -f` | PDF or DOCX to parse and extract in one step (mutually exclusive with artifact ID) |
|
|
228
|
+
| `--schema, -s` | Path to JSON Schema file defining fields to extract |
|
|
229
|
+
| `--schema-json` | Inline JSON Schema string |
|
|
230
|
+
| `--template, -t` | Built-in template ID (e.g. `invoice`) |
|
|
231
|
+
| `--language, -l` | Language code (default: `en`) |
|
|
232
|
+
| `--wait/--no-wait` | Wait for job to complete (default: `--wait`) |
|
|
233
|
+
| `--timeout` | Wait timeout in seconds (default: `300`) |
|
|
234
|
+
| `--json` | Raw JSON output |
|
|
235
|
+
|
|
236
|
+
### `expunct safe-parse`
|
|
237
|
+
|
|
238
|
+
Parse a PDF or DOCX and sanitize PII in a single workflow. Produces sanitized
|
|
239
|
+
canonical document, sanitized markdown, and sanitized chunk artifacts suitable
|
|
240
|
+
for AI ingestion (RAG, prompts, embeddings).
|
|
241
|
+
|
|
242
|
+
```bash
|
|
243
|
+
expunct safe-parse contract.pdf
|
|
244
|
+
expunct safe-parse report.docx --policy-id strict
|
|
245
|
+
expunct safe-parse contract.pdf --no-wait # submit and return job ID immediately
|
|
246
|
+
expunct safe-parse contract.pdf --json # raw JSON output
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
| Flag | Description |
|
|
250
|
+
|------|-------------|
|
|
251
|
+
| `--policy-id, -p` | Redaction policy ID applied during sanitization |
|
|
252
|
+
| `--language, -l` | Language code (default: `en`) |
|
|
253
|
+
| `--wait/--no-wait` | Wait for job to complete (default: `--wait`) |
|
|
254
|
+
| `--timeout` | Wait timeout in seconds (default: `300`) |
|
|
255
|
+
| `--json` | Raw JSON output |
|
|
256
|
+
|
|
184
257
|
### `expunct config`
|
|
185
258
|
|
|
186
259
|
Manage CLI configuration.
|
|
@@ -4,13 +4,13 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "expunct-cli"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.2.0"
|
|
8
8
|
description = "CLI for the Expunct PII redaction API — redact, detect, and manage sensitive data from the command line."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
11
11
|
requires-python = ">=3.10"
|
|
12
12
|
authors = [{ name = "Expunct" }]
|
|
13
|
-
keywords = ["pii", "redaction", "privacy", "cli", "security"]
|
|
13
|
+
keywords = ["pii", "redaction", "privacy", "cli", "security", "gdpr", "hipaa", "llm", "rag", "ai", "pdf"]
|
|
14
14
|
classifiers = [
|
|
15
15
|
"Development Status :: 4 - Beta",
|
|
16
16
|
"Environment :: Console",
|
|
@@ -23,10 +23,11 @@ classifiers = [
|
|
|
23
23
|
"Programming Language :: Python :: 3.12",
|
|
24
24
|
"Programming Language :: Python :: 3.13",
|
|
25
25
|
"Topic :: Security",
|
|
26
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
26
27
|
"Topic :: Software Development :: Libraries",
|
|
27
28
|
]
|
|
28
29
|
dependencies = [
|
|
29
|
-
"expunct>=0.
|
|
30
|
+
"expunct>=0.2.0",
|
|
30
31
|
"typer>=0.15",
|
|
31
32
|
"rich>=13.0",
|
|
32
33
|
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.0"
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import typer
|
|
8
|
+
from expunct import (
|
|
9
|
+
ApiError,
|
|
10
|
+
AuthenticationError,
|
|
11
|
+
NotFoundError,
|
|
12
|
+
PollingTimeoutError,
|
|
13
|
+
RateLimitError,
|
|
14
|
+
ValidationError,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
from expunct_cli.client import get_client
|
|
18
|
+
from expunct_cli.output import console, print_error, print_json
|
|
19
|
+
|
|
20
|
+
# Typer application for the `extract` command. The command logic lives in the
# callback below; `invoke_without_command=True` lets that callback run even
# though no subcommands are registered on this app.
app = typer.Typer(no_args_is_help=False, invoke_without_command=True)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@app.callback(invoke_without_command=True)
def extract(
    ctx: typer.Context,
    parse_artifact_id: str | None = typer.Argument(
        None,
        help="Canonical document artifact ID from a completed parse job. "
        "Mutually exclusive with --file.",
    ),
    file: Path | None = typer.Option(
        None, "--file", "-f",
        help="PDF or DOCX file to parse and extract in one step. "
        "Mutually exclusive with PARSE_ARTIFACT_ID.",
    ),
    schema: Path | None = typer.Option(
        None, "--schema", "-s",
        help="Path to a JSON Schema file defining the fields to extract.",
    ),
    schema_json: str | None = typer.Option(
        None, "--schema-json",
        help="Inline JSON Schema string (alternative to --schema).",
    ),
    template_id: str | None = typer.Option(
        None, "--template", "-t",
        help="Built-in extraction template ID (e.g. 'invoice'). "
        "Mutually exclusive with --schema / --schema-json.",
    ),
    language: str = typer.Option("en", "--language", "-l", help="Language code."),
    wait: bool = typer.Option(True, "--wait/--no-wait", help="Wait for job to complete."),
    timeout: int = typer.Option(300, "--timeout", help="Timeout in seconds when waiting."),
    json_output: bool = typer.Option(False, "--json", help="Output raw JSON."),
) -> None:
    """Extract structured fields from a parsed document.

    Provide exactly one source: a PARSE_ARTIFACT_ID (canonical document artifact
    from a completed parse job) or --file (parses and extracts in one step).

    Provide exactly one schema: --schema / --schema-json or --template.

    Examples:

        expunct extract art-abc123 --schema invoice_schema.json

        expunct extract --file report.pdf --template invoice

        expunct extract art-abc123 \\
            --schema-json '{"type":"object","properties":{"total":{"type":"number"}}}'
    """
    # NOTE: the docstring above doubles as the command's --help text, so
    # implementation notes are kept in comments instead.
    #
    # Every failure path prints a message via print_error and exits with
    # status 1 (typer.Exit); SDK errors are translated the same way at the
    # bottom of the function.
    if ctx.invoked_subcommand is not None:
        return

    # Validate source: exactly one of PARSE_ARTIFACT_ID / --file.
    if parse_artifact_id is None and file is None:
        print_error("Provide either a PARSE_ARTIFACT_ID argument or --file.")
        raise typer.Exit(1)
    if parse_artifact_id is not None and file is not None:
        print_error("PARSE_ARTIFACT_ID and --file are mutually exclusive.")
        raise typer.Exit(1)

    # Validate schema: exactly one of --schema / --schema-json / --template.
    if schema is None and schema_json is None and template_id is None:
        print_error("Provide --schema, --schema-json, or --template.")
        raise typer.Exit(1)
    if template_id is not None and (schema is not None or schema_json is not None):
        print_error("--template and --schema/--schema-json are mutually exclusive.")
        raise typer.Exit(1)
    if schema is not None and schema_json is not None:
        print_error("--schema and --schema-json are mutually exclusive.")
        raise typer.Exit(1)

    # Resolve schema dict (stays None when --template is used).
    resolved_schema: dict | None = None
    if schema is not None:
        # is_file() rather than exists(): a directory would pass exists() and
        # then fail with an unhandled error when read.
        if not schema.is_file():
            print_error(f"Schema file not found: {schema}")
            raise typer.Exit(1)
        try:
            # JSON is UTF-8 by specification; do not depend on the locale.
            resolved_schema = json.loads(schema.read_text(encoding="utf-8"))
        except json.JSONDecodeError as e:
            print_error(f"Invalid JSON in schema file: {e}")
            raise typer.Exit(1)
        except (OSError, UnicodeDecodeError) as e:
            print_error(f"Could not read schema file {schema}: {e}")
            raise typer.Exit(1)
    elif schema_json is not None:
        try:
            resolved_schema = json.loads(schema_json)
        except json.JSONDecodeError as e:
            print_error(f"Invalid --schema-json: {e}")
            raise typer.Exit(1)

    # Validate the local input file before creating the API client, matching
    # the order used by the `parse` and `safe-parse` commands.
    if file is not None:
        if not file.is_file():
            print_error(f"File not found: {file}")
            raise typer.Exit(1)
        suffix = file.suffix.lower()
        if suffix not in {".pdf", ".docx"}:
            print_error(f"Unsupported file type: {suffix}. Accepted: .pdf, .docx")
            raise typer.Exit(1)

    try:
        client = get_client()

        if file is None:
            job = client.documents.extract(
                parse_artifact_id=parse_artifact_id,
                schema=resolved_schema,
                template_id=template_id,
                language=language,
            )
        else:
            # Open separately so that only local I/O failures are reported as
            # such; errors raised by the API call are handled below.
            try:
                f_obj = open(file, "rb")
            except OSError as e:
                print_error(f"Could not open {file}: {e}")
                raise typer.Exit(1)
            with f_obj:
                job = client.documents.extract(
                    file=f_obj,
                    schema=resolved_schema,
                    template_id=template_id,
                    language=language,
                )

        if not wait:
            if json_output:
                print_json(job)
            else:
                console.print(f"[bold cyan]Job submitted:[/] [dim]{job.id}[/]")
                console.print(f"Status: {job.status}")
            return

        # Poll until complete
        import time

        deadline = time.monotonic() + timeout
        detail = client.documents.get_job(job.id)

        while detail.status not in {"completed", "failed", "error"}:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                print_error(f"Timed out waiting for job {job.id}")
                raise typer.Exit(1)
            # Never sleep past the deadline so --timeout is honoured.
            time.sleep(min(2.0, remaining))
            detail = client.documents.get_job(job.id)

        if detail.status in {"failed", "error"}:
            print_error(f"Job {job.id} failed: {detail.error_message or 'unknown error'}")
            raise typer.Exit(1)

        # Find the extract_result artifact
        result_artifact = next(
            (a for a in detail.artifacts if a.artifact_kind == "extract_result"), None
        )

        if result_artifact is None:
            print_error("No extract_result artifact found in completed job.")
            raise typer.Exit(1)

        # Fetch artifact content (JSON payload, not metadata)
        content = client.documents.get_artifact_content(result_artifact.id)

        if json_output:
            json.dump(content, sys.stdout, indent=2, default=str)
            sys.stdout.write("\n")
        else:
            console.print(f"[bold green]Extraction complete[/] — job [dim]{detail.id}[/]")
            _print_fields(content)

    except (
        ApiError, AuthenticationError, NotFoundError,
        RateLimitError, ValidationError, PollingTimeoutError,
    ) as e:
        print_error(str(e))
        raise typer.Exit(1)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _print_fields(content: dict) -> None:
    """Pretty-print extracted fields from an extract_result artifact.

    Reads the ``fields`` list from *content* (each entry is read via its
    ``name``, ``value`` and ``confidence`` keys) and renders it as a table,
    followed by any entries in ``validation_errors``.

    Field names, values and error messages originate from the processed
    document, so they are escaped before being handed to Rich; otherwise text
    containing square brackets would be interpreted as console markup.
    """
    from rich.markup import escape
    from rich.table import Table

    # `or []` also covers an explicit null in the payload.
    fields = content.get("fields") or []
    if not fields:
        console.print("[dim]No fields extracted.[/]")
        return

    table = Table(title="Extracted Fields")
    table.add_column("Field", style="cyan")
    table.add_column("Value", style="bold")
    table.add_column("Confidence", justify="right", style="yellow")

    for field in fields:
        name = field.get("name", "")
        value = field.get("value", "")
        confidence = field.get("confidence")
        # Format every real number the same way (JSON may deliver 1 or 0 as
        # ints); bool is excluded because it is an int subclass.
        if isinstance(confidence, (int, float)) and not isinstance(confidence, bool):
            conf_str = f"{confidence:.2f}"
        else:
            conf_str = escape(str(confidence or ""))
        value_cell = "[dim]null[/]" if value is None else escape(str(value))
        table.add_row(escape(str(name)), value_cell, conf_str)

    console.print(table)

    errors = content.get("validation_errors") or []
    if errors:
        console.print(f"\n[yellow]Validation errors:[/] {len(errors)}")
        for err in errors:
            console.print(f" [red]•[/] {escape(str(err))}")
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import typer
|
|
6
|
+
from expunct import (
|
|
7
|
+
ApiError,
|
|
8
|
+
AuthenticationError,
|
|
9
|
+
NotFoundError,
|
|
10
|
+
PollingTimeoutError,
|
|
11
|
+
RateLimitError,
|
|
12
|
+
ValidationError,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
from expunct_cli.client import get_client
|
|
16
|
+
from expunct_cli.output import console, print_error, print_json
|
|
17
|
+
|
|
18
|
+
# Typer application for the `parse` command. The command logic lives in the
# callback below; `invoke_without_command=True` lets that callback run even
# though no subcommands are registered on this app.
app = typer.Typer(no_args_is_help=False, invoke_without_command=True)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@app.callback(invoke_without_command=True)
def parse(
    ctx: typer.Context,
    file: Path = typer.Argument(..., help="PDF or DOCX file to parse."),
    language: str = typer.Option("en", "--language", "-l", help="Language code."),
    wait: bool = typer.Option(True, "--wait/--no-wait", help="Wait for job to complete."),
    timeout: int = typer.Option(300, "--timeout", help="Timeout in seconds when waiting."),
    json_output: bool = typer.Option(False, "--json", help="Output raw JSON."),
) -> None:
    """Parse a PDF or DOCX into a canonical document structure.

    Submits the file to the Document Intelligence API and (by default) waits
    for the job to complete, then prints a summary of the produced artifacts.

    Use --no-wait to return the job ID immediately for manual polling.
    """
    # NOTE: the docstring above doubles as the command's --help text, so
    # implementation notes are kept in comments instead.
    #
    # Every failure path prints a message via print_error and exits with
    # status 1 (typer.Exit); SDK errors are translated the same way at the
    # bottom of the function.
    if ctx.invoked_subcommand is not None:
        return

    # is_file() rather than exists(): a directory would pass exists() and then
    # fail with an unhandled error when opened.
    if not file.is_file():
        print_error(f"File not found: {file}")
        raise typer.Exit(1)

    suffix = file.suffix.lower()
    if suffix not in {".pdf", ".docx"}:
        print_error(f"Unsupported file type: {suffix}. Accepted: .pdf, .docx")
        raise typer.Exit(1)

    try:
        client = get_client()

        # Open separately so that only local I/O failures are reported as
        # such; errors raised by the API call are handled below.
        try:
            f = open(file, "rb")
        except OSError as e:
            print_error(f"Could not open {file}: {e}")
            raise typer.Exit(1)
        with f:
            job = client.documents.parse(f, language=language)

        if not wait:
            if json_output:
                print_json(job)
            else:
                console.print(f"[bold cyan]Job submitted:[/] [dim]{job.id}[/]")
                console.print(f"Status: {job.status}")
                console.print(
                    f"Poll with: [bold]expunct jobs get {job.id}[/] "
                    f"(or use expunct_get_document_job in MCP)"
                )
            return

        # Wait for job to complete by polling
        import time

        deadline = time.monotonic() + timeout
        detail = client.documents.get_job(job.id)

        while detail.status not in {"completed", "failed", "error"}:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                print_error(f"Timed out waiting for job {job.id}")
                raise typer.Exit(1)
            # Never sleep past the deadline so --timeout is honoured.
            time.sleep(min(2.0, remaining))
            detail = client.documents.get_job(job.id)

        if detail.status in {"failed", "error"}:
            print_error(f"Job {job.id} failed: {detail.error_message or 'unknown error'}")
            raise typer.Exit(1)

        if json_output:
            print_json(detail)
        else:
            console.print(f"[bold green]Parse complete[/] — job [dim]{detail.id}[/]")
            console.print(f" Pages: {_pages(detail.artifacts)}")
            console.print(f" Artifacts produced: {len(detail.artifacts)}")
            for artifact in detail.artifacts:
                console.print(
                    f" [cyan]{artifact.artifact_kind}[/] id=[dim]{artifact.id}[/]"
                )
            console.print(
                "\nRetrieve artifact content with: "
                "[bold]expunct_get_artifact[/] (MCP) or the Expunct SDK."
            )

    except (
        ApiError, AuthenticationError, NotFoundError,
        RateLimitError, ValidationError, PollingTimeoutError,
    ) as e:
        print_error(str(e))
        raise typer.Exit(1)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _pages(artifacts: list) -> str:
    """Return the page count of the parsed document as a string.

    Uses the first ``canonical_document`` artifact that carries a non-null
    ``page_count``; returns ``"unknown"`` when no such artifact exists.
    """
    page_counts = (
        item.page_count
        for item in artifacts
        if item.artifact_kind == "canonical_document" and item.page_count is not None
    )
    first_count = next(page_counts, None)
    return "unknown" if first_count is None else str(first_count)
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import typer
|
|
6
|
+
from expunct import (
|
|
7
|
+
ApiError,
|
|
8
|
+
AuthenticationError,
|
|
9
|
+
NotFoundError,
|
|
10
|
+
PollingTimeoutError,
|
|
11
|
+
RateLimitError,
|
|
12
|
+
ValidationError,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
from expunct_cli.client import get_client
|
|
16
|
+
from expunct_cli.output import console, print_error, print_json
|
|
17
|
+
|
|
18
|
+
# Typer application for the `safe-parse` command. The command logic lives in
# the callback below; `invoke_without_command=True` lets that callback run
# even though no subcommands are registered on this app.
app = typer.Typer(no_args_is_help=False, invoke_without_command=True)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@app.callback(invoke_without_command=True)
def safe_parse(
    ctx: typer.Context,
    file: Path = typer.Argument(..., help="PDF or DOCX file to safe-parse."),
    policy_id: str | None = typer.Option(
        None, "--policy-id", "-p",
        help="Redaction policy ID to apply during sanitization.",
    ),
    language: str = typer.Option("en", "--language", "-l", help="Language code."),
    wait: bool = typer.Option(True, "--wait/--no-wait", help="Wait for job to complete."),
    timeout: int = typer.Option(300, "--timeout", help="Timeout in seconds when waiting."),
    json_output: bool = typer.Option(False, "--json", help="Output raw JSON."),
) -> None:
    """Parse a PDF or DOCX and sanitize PII in one workflow (beta).

    Produces sanitized canonical document, sanitized markdown, and sanitized
    chunk artifacts suitable for downstream AI ingestion (RAG, prompts, etc.).

    Document Intelligence is currently in beta and gated by feature flag —
    your tenant must be enabled before this command will succeed.
    """
    # NOTE: the docstring above doubles as the command's --help text, so
    # implementation notes are kept in comments instead.
    #
    # Every failure path prints a message via print_error and exits with
    # status 1 (typer.Exit); SDK errors are translated the same way at the
    # bottom of the function.
    if ctx.invoked_subcommand is not None:
        return

    # is_file() rather than exists(): a directory would pass exists() and then
    # fail with an unhandled error when opened.
    if not file.is_file():
        print_error(f"File not found: {file}")
        raise typer.Exit(1)

    suffix = file.suffix.lower()
    if suffix not in {".pdf", ".docx"}:
        print_error(f"Unsupported file type: {suffix}. Accepted: .pdf, .docx")
        raise typer.Exit(1)

    try:
        client = get_client()

        # Open separately so that only local I/O failures are reported as
        # such; errors raised by the API call are handled below.
        try:
            f = open(file, "rb")
        except OSError as e:
            print_error(f"Could not open {file}: {e}")
            raise typer.Exit(1)
        with f:
            job = client.documents.safe_parse(f, policy_id=policy_id, language=language)

        if not wait:
            if json_output:
                print_json(job)
            else:
                console.print(f"[bold cyan]Job submitted:[/] [dim]{job.id}[/]")
                console.print(f"Status: {job.status}")
                console.print(
                    f"Poll with: [bold]expunct jobs get {job.id}[/] "
                    f"(or use expunct_get_document_job in MCP)"
                )
            return

        # Wait for job to complete by polling
        import time

        deadline = time.monotonic() + timeout
        detail = client.documents.get_job(job.id)

        while detail.status not in {"completed", "failed", "error"}:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                print_error(f"Timed out waiting for job {job.id}")
                raise typer.Exit(1)
            # Never sleep past the deadline so --timeout is honoured.
            time.sleep(min(2.0, remaining))
            detail = client.documents.get_job(job.id)

        if detail.status in {"failed", "error"}:
            print_error(f"Job {job.id} failed: {detail.error_message or 'unknown error'}")
            raise typer.Exit(1)

        if json_output:
            print_json(detail)
        else:
            console.print(f"[bold green]Safe-parse complete[/] — job [dim]{detail.id}[/]")
            console.print(f" Pages: {_pages(detail.artifacts)}")
            console.print(f" Artifacts produced: {len(detail.artifacts)}")
            for artifact in detail.artifacts:
                console.print(
                    f" [cyan]{artifact.artifact_kind}[/] id=[dim]{artifact.id}[/]"
                )
            console.print(
                "\nRetrieve sanitized artifact content with: "
                "[bold]expunct_get_artifact[/] (MCP) or the Expunct SDK."
            )

    except (
        ApiError, AuthenticationError, NotFoundError,
        RateLimitError, ValidationError, PollingTimeoutError,
    ) as e:
        print_error(str(e))
        raise typer.Exit(1)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _pages(artifacts: list) -> str:
    """Return the page count of the safe-parsed document as a string.

    Prefers a ``sanitized_canonical_document`` artifact with a non-null
    ``page_count``; if none has one, falls back to a ``canonical_document``
    artifact. Returns ``"unknown"`` when neither kind reports a page count.
    """
    # Kinds are tried in priority order; each pass scans the full list.
    for wanted_kind in ("sanitized_canonical_document", "canonical_document"):
        for item in artifacts:
            if item.artifact_kind == wanted_kind and item.page_count is not None:
                return str(item.page_count)
    return "unknown"
|
|
@@ -6,9 +6,12 @@ from expunct_cli import __version__
|
|
|
6
6
|
from expunct_cli.commands.audit import app as audit_app
|
|
7
7
|
from expunct_cli.commands.config_cmd import app as config_app
|
|
8
8
|
from expunct_cli.commands.detect import app as detect_app
|
|
9
|
+
from expunct_cli.commands.extract import app as extract_app
|
|
9
10
|
from expunct_cli.commands.jobs import app as jobs_app
|
|
11
|
+
from expunct_cli.commands.parse import app as parse_app
|
|
10
12
|
from expunct_cli.commands.policies import app as policies_app
|
|
11
13
|
from expunct_cli.commands.redact import app as redact_app
|
|
14
|
+
from expunct_cli.commands.safe_parse import app as safe_parse_app
|
|
12
15
|
|
|
13
16
|
app = typer.Typer(
|
|
14
17
|
name="expunct",
|
|
@@ -41,6 +44,21 @@ app.add_typer(jobs_app, name="jobs", help="Manage redaction jobs.")
|
|
|
41
44
|
app.add_typer(policies_app, name="policies", help="Manage redaction policies.")
|
|
42
45
|
app.add_typer(audit_app, name="audit", help="View audit logs.")
|
|
43
46
|
app.add_typer(config_app, name="config", help="Manage CLI configuration.")
|
|
47
|
+
app.add_typer(
|
|
48
|
+
parse_app,
|
|
49
|
+
name="parse",
|
|
50
|
+
help="Parse a PDF or DOCX into structured document artifacts.",
|
|
51
|
+
)
|
|
52
|
+
app.add_typer(
|
|
53
|
+
extract_app,
|
|
54
|
+
name="extract",
|
|
55
|
+
help="Extract structured fields from a parsed document.",
|
|
56
|
+
)
|
|
57
|
+
app.add_typer(
|
|
58
|
+
safe_parse_app,
|
|
59
|
+
name="safe-parse",
|
|
60
|
+
help="Parse a PDF or DOCX and sanitize PII in one workflow (beta).",
|
|
61
|
+
)
|
|
44
62
|
|
|
45
63
|
if __name__ == "__main__":
|
|
46
64
|
app()
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.1.0"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|