classifyre-cli 0.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2/.gitignore +65 -0
- classifyre_cli-0.4.2/.python-version +1 -0
- classifyre_cli-0.4.2/.turbo/turbo-build.log +3 -0
- classifyre_cli-0.4.2/PKG-INFO +167 -0
- classifyre_cli-0.4.2/README.md +150 -0
- classifyre_cli-0.4.2/main.py +4 -0
- classifyre_cli-0.4.2/package.json +17 -0
- classifyre_cli-0.4.2/pyproject.toml +323 -0
- classifyre_cli-0.4.2/scripts/generate_models.py +88 -0
- classifyre_cli-0.4.2/src/__init__.py +1 -0
- classifyre_cli-0.4.2/src/detectors/__init__.py +105 -0
- classifyre_cli-0.4.2/src/detectors/base.py +97 -0
- classifyre_cli-0.4.2/src/detectors/broken_links/__init__.py +3 -0
- classifyre_cli-0.4.2/src/detectors/broken_links/detector.py +280 -0
- classifyre_cli-0.4.2/src/detectors/config.py +59 -0
- classifyre_cli-0.4.2/src/detectors/content/__init__.py +0 -0
- classifyre_cli-0.4.2/src/detectors/custom/__init__.py +13 -0
- classifyre_cli-0.4.2/src/detectors/custom/detector.py +45 -0
- classifyre_cli-0.4.2/src/detectors/custom/runners/__init__.py +56 -0
- classifyre_cli-0.4.2/src/detectors/custom/runners/_base.py +177 -0
- classifyre_cli-0.4.2/src/detectors/custom/runners/_factory.py +51 -0
- classifyre_cli-0.4.2/src/detectors/custom/runners/_feature_extraction.py +138 -0
- classifyre_cli-0.4.2/src/detectors/custom/runners/_gliner2.py +324 -0
- classifyre_cli-0.4.2/src/detectors/custom/runners/_image_classification.py +98 -0
- classifyre_cli-0.4.2/src/detectors/custom/runners/_llm.py +22 -0
- classifyre_cli-0.4.2/src/detectors/custom/runners/_object_detection.py +107 -0
- classifyre_cli-0.4.2/src/detectors/custom/runners/_regex.py +147 -0
- classifyre_cli-0.4.2/src/detectors/custom/runners/_text_classification.py +109 -0
- classifyre_cli-0.4.2/src/detectors/custom/trainer.py +293 -0
- classifyre_cli-0.4.2/src/detectors/dependencies.py +109 -0
- classifyre_cli-0.4.2/src/detectors/pii/__init__.py +0 -0
- classifyre_cli-0.4.2/src/detectors/pii/detector.py +883 -0
- classifyre_cli-0.4.2/src/detectors/secrets/__init__.py +0 -0
- classifyre_cli-0.4.2/src/detectors/secrets/detector.py +399 -0
- classifyre_cli-0.4.2/src/detectors/threat/__init__.py +0 -0
- classifyre_cli-0.4.2/src/detectors/threat/code_security_detector.py +206 -0
- classifyre_cli-0.4.2/src/detectors/threat/yara_detector.py +177 -0
- classifyre_cli-0.4.2/src/main.py +608 -0
- classifyre_cli-0.4.2/src/models/generated_detectors.py +1296 -0
- classifyre_cli-0.4.2/src/models/generated_input.py +2732 -0
- classifyre_cli-0.4.2/src/models/generated_single_asset_scan_results.py +240 -0
- classifyre_cli-0.4.2/src/outputs/__init__.py +3 -0
- classifyre_cli-0.4.2/src/outputs/base.py +69 -0
- classifyre_cli-0.4.2/src/outputs/console.py +62 -0
- classifyre_cli-0.4.2/src/outputs/factory.py +156 -0
- classifyre_cli-0.4.2/src/outputs/file.py +83 -0
- classifyre_cli-0.4.2/src/outputs/rest.py +258 -0
- classifyre_cli-0.4.2/src/pipeline/__init__.py +7 -0
- classifyre_cli-0.4.2/src/pipeline/content_provider.py +26 -0
- classifyre_cli-0.4.2/src/pipeline/detector_pipeline.py +742 -0
- classifyre_cli-0.4.2/src/pipeline/parsed_content_provider.py +59 -0
- classifyre_cli-0.4.2/src/sandbox/__init__.py +5 -0
- classifyre_cli-0.4.2/src/sandbox/runner.py +145 -0
- classifyre_cli-0.4.2/src/sources/__init__.py +95 -0
- classifyre_cli-0.4.2/src/sources/atlassian_common.py +389 -0
- classifyre_cli-0.4.2/src/sources/azure_blob_storage/__init__.py +3 -0
- classifyre_cli-0.4.2/src/sources/azure_blob_storage/source.py +130 -0
- classifyre_cli-0.4.2/src/sources/base.py +296 -0
- classifyre_cli-0.4.2/src/sources/confluence/__init__.py +3 -0
- classifyre_cli-0.4.2/src/sources/confluence/source.py +733 -0
- classifyre_cli-0.4.2/src/sources/databricks/__init__.py +3 -0
- classifyre_cli-0.4.2/src/sources/databricks/source.py +1279 -0
- classifyre_cli-0.4.2/src/sources/dependencies.py +81 -0
- classifyre_cli-0.4.2/src/sources/google_cloud_storage/__init__.py +3 -0
- classifyre_cli-0.4.2/src/sources/google_cloud_storage/source.py +114 -0
- classifyre_cli-0.4.2/src/sources/hive/__init__.py +3 -0
- classifyre_cli-0.4.2/src/sources/hive/source.py +709 -0
- classifyre_cli-0.4.2/src/sources/jira/__init__.py +3 -0
- classifyre_cli-0.4.2/src/sources/jira/source.py +605 -0
- classifyre_cli-0.4.2/src/sources/mongodb/__init__.py +3 -0
- classifyre_cli-0.4.2/src/sources/mongodb/source.py +550 -0
- classifyre_cli-0.4.2/src/sources/mssql/__init__.py +3 -0
- classifyre_cli-0.4.2/src/sources/mssql/source.py +1034 -0
- classifyre_cli-0.4.2/src/sources/mysql/__init__.py +3 -0
- classifyre_cli-0.4.2/src/sources/mysql/source.py +797 -0
- classifyre_cli-0.4.2/src/sources/neo4j/__init__.py +0 -0
- classifyre_cli-0.4.2/src/sources/neo4j/source.py +523 -0
- classifyre_cli-0.4.2/src/sources/object_storage/base.py +679 -0
- classifyre_cli-0.4.2/src/sources/oracle/__init__.py +3 -0
- classifyre_cli-0.4.2/src/sources/oracle/source.py +982 -0
- classifyre_cli-0.4.2/src/sources/postgresql/__init__.py +3 -0
- classifyre_cli-0.4.2/src/sources/postgresql/source.py +774 -0
- classifyre_cli-0.4.2/src/sources/powerbi/__init__.py +3 -0
- classifyre_cli-0.4.2/src/sources/powerbi/source.py +774 -0
- classifyre_cli-0.4.2/src/sources/recipe_normalizer.py +179 -0
- classifyre_cli-0.4.2/src/sources/s3_compatible_storage/README.md +66 -0
- classifyre_cli-0.4.2/src/sources/s3_compatible_storage/__init__.py +3 -0
- classifyre_cli-0.4.2/src/sources/s3_compatible_storage/source.py +150 -0
- classifyre_cli-0.4.2/src/sources/servicedesk/__init__.py +3 -0
- classifyre_cli-0.4.2/src/sources/servicedesk/source.py +620 -0
- classifyre_cli-0.4.2/src/sources/slack/__init__.py +3 -0
- classifyre_cli-0.4.2/src/sources/slack/source.py +534 -0
- classifyre_cli-0.4.2/src/sources/snowflake/__init__.py +3 -0
- classifyre_cli-0.4.2/src/sources/snowflake/source.py +912 -0
- classifyre_cli-0.4.2/src/sources/tableau/__init__.py +3 -0
- classifyre_cli-0.4.2/src/sources/tableau/source.py +799 -0
- classifyre_cli-0.4.2/src/sources/tabular_utils.py +165 -0
- classifyre_cli-0.4.2/src/sources/wordpress/__init__.py +3 -0
- classifyre_cli-0.4.2/src/sources/wordpress/source.py +590 -0
- classifyre_cli-0.4.2/src/telemetry.py +96 -0
- classifyre_cli-0.4.2/src/utils/__init__.py +1 -0
- classifyre_cli-0.4.2/src/utils/content_extraction.py +108 -0
- classifyre_cli-0.4.2/src/utils/file_parser.py +777 -0
- classifyre_cli-0.4.2/src/utils/hashing.py +82 -0
- classifyre_cli-0.4.2/src/utils/uv_sync.py +79 -0
- classifyre_cli-0.4.2/src/utils/validation.py +56 -0
- classifyre_cli-0.4.2/tests/__init__.py +0 -0
- classifyre_cli-0.4.2/tests/conftest.py +21 -0
- classifyre_cli-0.4.2/tests/detectors/__init__.py +0 -0
- classifyre_cli-0.4.2/tests/detectors/broken_links/test_broken_links_detector.py +100 -0
- classifyre_cli-0.4.2/tests/detectors/conftest.py +173 -0
- classifyre_cli-0.4.2/tests/detectors/content/__init__.py +0 -0
- classifyre_cli-0.4.2/tests/detectors/custom/__init__.py +0 -0
- classifyre_cli-0.4.2/tests/detectors/custom/conftest.py +21 -0
- classifyre_cli-0.4.2/tests/detectors/custom/test_invoice_extraction.py +159 -0
- classifyre_cli-0.4.2/tests/detectors/custom/test_pipeline_integration.py +220 -0
- classifyre_cli-0.4.2/tests/detectors/custom/test_regex_runner.py +367 -0
- classifyre_cli-0.4.2/tests/detectors/custom/test_transformer_runners.py +329 -0
- classifyre_cli-0.4.2/tests/detectors/pii/__init__.py +0 -0
- classifyre_cli-0.4.2/tests/detectors/pii/conftest.py +19 -0
- classifyre_cli-0.4.2/tests/detectors/pii/sample_invoice.pdf +0 -0
- classifyre_cli-0.4.2/tests/detectors/pii/test_pii_detector.py +511 -0
- classifyre_cli-0.4.2/tests/detectors/pii/test_pii_detector_extended.py +177 -0
- classifyre_cli-0.4.2/tests/detectors/secrets/__init__.py +0 -0
- classifyre_cli-0.4.2/tests/detectors/secrets/test_secrets_detector.py +267 -0
- classifyre_cli-0.4.2/tests/detectors/secrets/test_secrets_detector_extended.py +213 -0
- classifyre_cli-0.4.2/tests/detectors/test_base_detector.py +147 -0
- classifyre_cli-0.4.2/tests/detectors/test_custom_detector_examples_runtime.py +157 -0
- classifyre_cli-0.4.2/tests/detectors/test_detector_catalog_commercial.py +72 -0
- classifyre_cli-0.4.2/tests/detectors/test_detector_pipeline_types.py +344 -0
- classifyre_cli-0.4.2/tests/detectors/test_detector_schema_examples.py +134 -0
- classifyre_cli-0.4.2/tests/detectors/test_detector_types.py +253 -0
- classifyre_cli-0.4.2/tests/detectors/test_phase2_detectors.py +1 -0
- classifyre_cli-0.4.2/tests/detectors/test_registry.py +40 -0
- classifyre_cli-0.4.2/tests/detectors/threat/__init__.py +0 -0
- classifyre_cli-0.4.2/tests/detectors/threat/test_code_security_detector.py +178 -0
- classifyre_cli-0.4.2/tests/detectors/threat/test_yara_detector.py +332 -0
- classifyre_cli-0.4.2/tests/integration/test_wordpress_broken_links_detector.py +122 -0
- classifyre_cli-0.4.2/tests/integration/test_wordpress_links_assets.py +101 -0
- classifyre_cli-0.4.2/tests/pipeline/test_detector_pipeline.py +657 -0
- classifyre_cli-0.4.2/tests/test_azure_blob_storage_source.py +83 -0
- classifyre_cli-0.4.2/tests/test_base_source_attachment.py +102 -0
- classifyre_cli-0.4.2/tests/test_base_source_sampling.py +48 -0
- classifyre_cli-0.4.2/tests/test_confluence_source.py +314 -0
- classifyre_cli-0.4.2/tests/test_databricks_source.py +417 -0
- classifyre_cli-0.4.2/tests/test_google_cloud_storage_source.py +74 -0
- classifyre_cli-0.4.2/tests/test_hashing.py +108 -0
- classifyre_cli-0.4.2/tests/test_hive_source.py +316 -0
- classifyre_cli-0.4.2/tests/test_jira_source.py +401 -0
- classifyre_cli-0.4.2/tests/test_mongodb_source.py +347 -0
- classifyre_cli-0.4.2/tests/test_mssql_source.py +429 -0
- classifyre_cli-0.4.2/tests/test_mysql_source.py +362 -0
- classifyre_cli-0.4.2/tests/test_neo4j_source.py +395 -0
- classifyre_cli-0.4.2/tests/test_oracle_source.py +334 -0
- classifyre_cli-0.4.2/tests/test_outputs.py +335 -0
- classifyre_cli-0.4.2/tests/test_postgresql_source.py +519 -0
- classifyre_cli-0.4.2/tests/test_powerbi_source.py +361 -0
- classifyre_cli-0.4.2/tests/test_recipe_normalizer.py +53 -0
- classifyre_cli-0.4.2/tests/test_s3_compatible_storage_source.py +213 -0
- classifyre_cli-0.4.2/tests/test_servicedesk_source.py +309 -0
- classifyre_cli-0.4.2/tests/test_slack_source.py +208 -0
- classifyre_cli-0.4.2/tests/test_snowflake_source.py +329 -0
- classifyre_cli-0.4.2/tests/test_source_dependency_groups.py +74 -0
- classifyre_cli-0.4.2/tests/test_tableau_source.py +361 -0
- classifyre_cli-0.4.2/tests/test_tabular_utils.py +156 -0
- classifyre_cli-0.4.2/tests/test_wordpress_source.py +287 -0
- classifyre_cli-0.4.2/tests/utils/test_content_extraction.py +150 -0
- classifyre_cli-0.4.2/tests/utils/test_file_parser.py +474 -0
- classifyre_cli-0.4.2/uv.lock +5560 -0
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
|
|
8
|
+
# Virtual environments
|
|
9
|
+
.venv/
|
|
10
|
+
venv/
|
|
11
|
+
ENV/
|
|
12
|
+
env/
|
|
13
|
+
|
|
14
|
+
# Distribution / packaging
|
|
15
|
+
.Python
|
|
16
|
+
build/
|
|
17
|
+
develop-eggs/
|
|
18
|
+
dist/
|
|
19
|
+
downloads/
|
|
20
|
+
eggs/
|
|
21
|
+
.eggs/
|
|
22
|
+
lib/
|
|
23
|
+
lib64/
|
|
24
|
+
parts/
|
|
25
|
+
sdist/
|
|
26
|
+
var/
|
|
27
|
+
wheels/
|
|
28
|
+
*.egg-info/
|
|
29
|
+
.installed.cfg
|
|
30
|
+
*.egg
|
|
31
|
+
|
|
32
|
+
# PyInstaller
|
|
33
|
+
*.manifest
|
|
34
|
+
*.spec
|
|
35
|
+
|
|
36
|
+
# Unit test / coverage reports
|
|
37
|
+
htmlcov/
|
|
38
|
+
.tox/
|
|
39
|
+
.coverage
|
|
40
|
+
.coverage.*
|
|
41
|
+
.cache
|
|
42
|
+
nosetests.xml
|
|
43
|
+
coverage.xml
|
|
44
|
+
*.cover
|
|
45
|
+
.hypothesis/
|
|
46
|
+
.pytest_cache/
|
|
47
|
+
|
|
48
|
+
# mypy
|
|
49
|
+
.mypy_cache/
|
|
50
|
+
.dmypy.json
|
|
51
|
+
dmypy.json
|
|
52
|
+
|
|
53
|
+
# ruff
|
|
54
|
+
.ruff_cache/
|
|
55
|
+
|
|
56
|
+
# IDEs
|
|
57
|
+
.vscode/
|
|
58
|
+
.idea/
|
|
59
|
+
*.swp
|
|
60
|
+
*.swo
|
|
61
|
+
*~
|
|
62
|
+
.DS_Store
|
|
63
|
+
|
|
64
|
+
# Local training artifacts
|
|
65
|
+
checkpoints/
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: classifyre-cli
|
|
3
|
+
Version: 0.4.2
|
|
4
|
+
Summary: Classifyre CLI — scan and classify unstructured data sources
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: data,ingestion,metadata,pii,secrets,unstructured
|
|
7
|
+
Requires-Python: >=3.12
|
|
8
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
9
|
+
Requires-Dist: classifyre-schemas
|
|
10
|
+
Requires-Dist: email-validator>=2.3.0
|
|
11
|
+
Requires-Dist: en-core-web-sm
|
|
12
|
+
Requires-Dist: jsonschema>=4.26.0
|
|
13
|
+
Requires-Dist: lxml>=6.1.1
|
|
14
|
+
Requires-Dist: pydantic>=2.13.4
|
|
15
|
+
Requires-Dist: requests>=2.34.2
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# CLI Application
|
|
19
|
+
|
|
20
|
+
Python CLI for source extraction, detector execution, and batched output delivery.
|
|
21
|
+
|
|
22
|
+
## Setup
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
cd /unstructured/apps/cli
|
|
26
|
+
uv sync
|
|
27
|
+
# Optional if you want an activated shell instead of `uv run ...`:
|
|
28
|
+
source .venv/bin/activate
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Optional detector groups:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
uv sync --group detectors
|
|
35
|
+
# or specific groups: --group secrets --group pii --group threat ...
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Command Syntax
|
|
39
|
+
|
|
40
|
+
Use the thin wrapper:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
uv run main.py <command> <recipe.json> [options]
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Or direct module entrypoint:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
uv run python -m src.main <command> <recipe.json> [options]
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Commands:
|
|
53
|
+
|
|
54
|
+
- `test` - test source connection.
|
|
55
|
+
- `discover` - discover source resources.
|
|
56
|
+
- `extract` - run extraction and emit batched output.
|
|
57
|
+
- `sandbox` - run sandbox parsing/detectors for a local file.
|
|
58
|
+
|
|
59
|
+
## Extract Output Model
|
|
60
|
+
|
|
61
|
+
Extraction always emits in batches.
|
|
62
|
+
Recipes do not contain `output` configuration; output is controlled by CLI flags and environment variables.
|
|
63
|
+
|
|
64
|
+
Output types:
|
|
65
|
+
|
|
66
|
+
- `console` - emits NDJSON envelopes to stdout.
|
|
67
|
+
- `file` - appends NDJSON envelopes to a file.
|
|
68
|
+
- `rest` - pushes batches to API endpoints and finalizes the run.
|
|
69
|
+
|
|
70
|
+
Default behavior:
|
|
71
|
+
|
|
72
|
+
- If `source_id` is present (`--source-id` or `SOURCE_ID` env), default output is `rest`.
|
|
73
|
+
- Otherwise default output is `console`.
|
|
74
|
+
- Default batch size is `20`.
|
|
75
|
+
|
|
76
|
+
## CLI Options
|
|
77
|
+
|
|
78
|
+
Global/common:
|
|
79
|
+
|
|
80
|
+
- `--debug` - enable debug logging.
|
|
81
|
+
- `--detectors-file <path>` - sandbox only.
|
|
82
|
+
|
|
83
|
+
Extract output options:
|
|
84
|
+
|
|
85
|
+
- `--output-type rest|file|console`
|
|
86
|
+
- `--output-batch-size <int>`
|
|
87
|
+
- `--output-rest-url <url>`
|
|
88
|
+
- `--output-file-path <path>`
|
|
89
|
+
- `--source-id <uuid>`
|
|
90
|
+
- `--runner-id <uuid>`
|
|
91
|
+
- `--managed-runner` (REST only; runner lifecycle managed by API orchestrator)
|
|
92
|
+
|
|
93
|
+
Environment fallbacks:
|
|
94
|
+
|
|
95
|
+
- `SOURCE_ID`, `RUNNER_ID`
|
|
96
|
+
- `CLASSIFYRE_OUTPUT_TYPE`, `CLASSIFYRE_OUTPUT_BATCH_SIZE`
|
|
97
|
+
- `CLASSIFYRE_OUTPUT_REST_URL`, `CLASSIFYRE_OUTPUT_REST_TIMEOUT_SEC`
|
|
98
|
+
- `CLASSIFYRE_OUTPUT_FILE_PATH`
|
|
99
|
+
- `API_URL` (fallback base URL for REST output)
|
|
100
|
+
|
|
101
|
+
## Practical Examples
|
|
102
|
+
|
|
103
|
+
### 1) Console output (quick local test)
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
uv run main.py extract ./wordpress-recipe.json --output-type console --output-batch-size 1
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
You will see NDJSON lines like:
|
|
110
|
+
|
|
111
|
+
- `{"event":"batch", ...}`
|
|
112
|
+
- `{"event":"finish", ...}`
|
|
113
|
+
|
|
114
|
+
### 2) File output
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
uv run main.py extract ./wordpress-recipe.json \
|
|
118
|
+
--output-type file \
|
|
119
|
+
--output-file-path /tmp/classifyre-assets.ndjson \
|
|
120
|
+
--output-batch-size 20
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### 3) REST output (manual CLI to backend)
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
uv run main.py extract ./wordpress-recipe.json \
|
|
127
|
+
--output-type rest \
|
|
128
|
+
--source-id <source_uuid>
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Notes:
|
|
132
|
+
|
|
133
|
+
- `--runner-id` is optional for manual runs. If omitted, the CLI creates an external runner automatically.
|
|
134
|
+
- `--output-rest-url` is optional. If omitted, CLI uses `CLASSIFYRE_OUTPUT_REST_URL`, then `API_URL`, then `http://localhost:8000`.
|
|
135
|
+
- `--managed-runner` should be used only for API-orchestrated runs where the runner already exists.
|
|
136
|
+
|
|
137
|
+
### 4) REST output with explicit runner (managed/orchestrated style)
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
uv run main.py extract ./wordpress-recipe.json \
|
|
141
|
+
--output-type rest \
|
|
142
|
+
--source-id <source_uuid> \
|
|
143
|
+
--runner-id <runner_uuid> \
|
|
144
|
+
--managed-runner
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### 5) Full extract command with all output flags
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
uv run main.py extract ./wordpress-recipe.json \
|
|
151
|
+
--output-type rest \
|
|
152
|
+
--output-batch-size 20 \
|
|
153
|
+
--output-rest-url http://localhost:8000 \
|
|
154
|
+
--output-file-path /tmp/classifyre-assets.ndjson \
|
|
155
|
+
--source-id <source_uuid> \
|
|
156
|
+
--runner-id <runner_uuid> \
|
|
157
|
+
--managed-runner
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
Use `--output-file-path` only when `--output-type` is `file`.
|
|
161
|
+
|
|
162
|
+
## Dev Scripts
|
|
163
|
+
|
|
164
|
+
- `bun run dev` - run CLI quickly.
|
|
165
|
+
- `bun run lint` - ruff format/check.
|
|
166
|
+
- `bun run check-types` - mypy.
|
|
167
|
+
- `bun run test` - pytest suite.
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
# CLI Application
|
|
2
|
+
|
|
3
|
+
Python CLI for source extraction, detector execution, and batched output delivery.
|
|
4
|
+
|
|
5
|
+
## Setup
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
cd /unstructured/apps/cli
|
|
9
|
+
uv sync
|
|
10
|
+
# Optional if you want an activated shell instead of `uv run ...`:
|
|
11
|
+
source .venv/bin/activate
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
Optional detector groups:
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
uv sync --group detectors
|
|
18
|
+
# or specific groups: --group secrets --group pii --group threat ...
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Command Syntax
|
|
22
|
+
|
|
23
|
+
Use the thin wrapper:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
uv run main.py <command> <recipe.json> [options]
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Or direct module entrypoint:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
uv run python -m src.main <command> <recipe.json> [options]
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Commands:
|
|
36
|
+
|
|
37
|
+
- `test` - test source connection.
|
|
38
|
+
- `discover` - discover source resources.
|
|
39
|
+
- `extract` - run extraction and emit batched output.
|
|
40
|
+
- `sandbox` - run sandbox parsing/detectors for a local file.
|
|
41
|
+
|
|
42
|
+
## Extract Output Model
|
|
43
|
+
|
|
44
|
+
Extraction always emits in batches.
|
|
45
|
+
Recipes do not contain `output` configuration; output is controlled by CLI flags and environment variables.
|
|
46
|
+
|
|
47
|
+
Output types:
|
|
48
|
+
|
|
49
|
+
- `console` - emits NDJSON envelopes to stdout.
|
|
50
|
+
- `file` - appends NDJSON envelopes to a file.
|
|
51
|
+
- `rest` - pushes batches to API endpoints and finalizes the run.
|
|
52
|
+
|
|
53
|
+
Default behavior:
|
|
54
|
+
|
|
55
|
+
- If `source_id` is present (`--source-id` or `SOURCE_ID` env), default output is `rest`.
|
|
56
|
+
- Otherwise default output is `console`.
|
|
57
|
+
- Default batch size is `20`.
|
|
58
|
+
|
|
59
|
+
## CLI Options
|
|
60
|
+
|
|
61
|
+
Global/common:
|
|
62
|
+
|
|
63
|
+
- `--debug` - enable debug logging.
|
|
64
|
+
- `--detectors-file <path>` - sandbox only.
|
|
65
|
+
|
|
66
|
+
Extract output options:
|
|
67
|
+
|
|
68
|
+
- `--output-type rest|file|console`
|
|
69
|
+
- `--output-batch-size <int>`
|
|
70
|
+
- `--output-rest-url <url>`
|
|
71
|
+
- `--output-file-path <path>`
|
|
72
|
+
- `--source-id <uuid>`
|
|
73
|
+
- `--runner-id <uuid>`
|
|
74
|
+
- `--managed-runner` (REST only; runner lifecycle managed by API orchestrator)
|
|
75
|
+
|
|
76
|
+
Environment fallbacks:
|
|
77
|
+
|
|
78
|
+
- `SOURCE_ID`, `RUNNER_ID`
|
|
79
|
+
- `CLASSIFYRE_OUTPUT_TYPE`, `CLASSIFYRE_OUTPUT_BATCH_SIZE`
|
|
80
|
+
- `CLASSIFYRE_OUTPUT_REST_URL`, `CLASSIFYRE_OUTPUT_REST_TIMEOUT_SEC`
|
|
81
|
+
- `CLASSIFYRE_OUTPUT_FILE_PATH`
|
|
82
|
+
- `API_URL` (fallback base URL for REST output)
|
|
83
|
+
|
|
84
|
+
## Practical Examples
|
|
85
|
+
|
|
86
|
+
### 1) Console output (quick local test)
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
uv run main.py extract ./wordpress-recipe.json --output-type console --output-batch-size 1
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
You will see NDJSON lines like:
|
|
93
|
+
|
|
94
|
+
- `{"event":"batch", ...}`
|
|
95
|
+
- `{"event":"finish", ...}`
|
|
96
|
+
|
|
97
|
+
### 2) File output
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
uv run main.py extract ./wordpress-recipe.json \
|
|
101
|
+
--output-type file \
|
|
102
|
+
--output-file-path /tmp/classifyre-assets.ndjson \
|
|
103
|
+
--output-batch-size 20
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### 3) REST output (manual CLI to backend)
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
uv run main.py extract ./wordpress-recipe.json \
|
|
110
|
+
--output-type rest \
|
|
111
|
+
--source-id <source_uuid>
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Notes:
|
|
115
|
+
|
|
116
|
+
- `--runner-id` is optional for manual runs. If omitted, the CLI creates an external runner automatically.
|
|
117
|
+
- `--output-rest-url` is optional. If omitted, CLI uses `CLASSIFYRE_OUTPUT_REST_URL`, then `API_URL`, then `http://localhost:8000`.
|
|
118
|
+
- `--managed-runner` should be used only for API-orchestrated runs where the runner already exists.
|
|
119
|
+
|
|
120
|
+
### 4) REST output with explicit runner (managed/orchestrated style)
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
uv run main.py extract ./wordpress-recipe.json \
|
|
124
|
+
--output-type rest \
|
|
125
|
+
--source-id <source_uuid> \
|
|
126
|
+
--runner-id <runner_uuid> \
|
|
127
|
+
--managed-runner
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### 5) Full extract command with all output flags
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
uv run main.py extract ./wordpress-recipe.json \
|
|
134
|
+
--output-type rest \
|
|
135
|
+
--output-batch-size 20 \
|
|
136
|
+
--output-rest-url http://localhost:8000 \
|
|
137
|
+
--output-file-path /tmp/classifyre-assets.ndjson \
|
|
138
|
+
--source-id <source_uuid> \
|
|
139
|
+
--runner-id <runner_uuid> \
|
|
140
|
+
--managed-runner
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Use `--output-file-path` only when `--output-type` is `file`.
|
|
144
|
+
|
|
145
|
+
## Dev Scripts
|
|
146
|
+
|
|
147
|
+
- `bun run dev` - run CLI quickly.
|
|
148
|
+
- `bun run lint` - ruff format/check.
|
|
149
|
+
- `bun run check-types` - mypy.
|
|
150
|
+
- `bun run test` - pytest suite.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@classifyre/cli",
|
|
3
|
+
"version": "0.4.2",
|
|
4
|
+
"private": true,
|
|
5
|
+
"scripts": {
|
|
6
|
+
"build": "uv sync",
|
|
7
|
+
"dev": "uv run main.py",
|
|
8
|
+
"lint": "uv run ruff check . --fix && uv run ruff format .",
|
|
9
|
+
"check-types": "uv run mypy .",
|
|
10
|
+
"test": "uv sync --group dev --group file-processing && uv run pytest",
|
|
11
|
+
"test:integration": "uv run pytest tests/integration/ -m integration",
|
|
12
|
+
"test:integration:run": "RUN_INTEGRATION_TESTS=1 uv run pytest tests/integration/ -m integration",
|
|
13
|
+
"e2e": "bun run test:e2e",
|
|
14
|
+
"test:e2e": "if rg -q '@pytest\\.mark\\.e2e' tests; then RUN_E2E_TESTS=1 uv run pytest -m e2e; else echo 'No CLI e2e tests collected; treating as pass'; fi",
|
|
15
|
+
"codegen": "uv run --python 3.12 --group dev --locked python scripts/generate_models.py"
|
|
16
|
+
}
|
|
17
|
+
}
|