classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: classifyre-cli
|
|
3
|
+
Version: 0.4.2
|
|
4
|
+
Summary: Classifyre CLI — scan and classify unstructured data sources
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: data,ingestion,metadata,pii,secrets,unstructured
|
|
7
|
+
Requires-Python: >=3.12
|
|
8
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
9
|
+
Requires-Dist: classifyre-schemas
|
|
10
|
+
Requires-Dist: email-validator>=2.3.0
|
|
11
|
+
Requires-Dist: en-core-web-sm
|
|
12
|
+
Requires-Dist: jsonschema>=4.26.0
|
|
13
|
+
Requires-Dist: lxml>=6.1.1
|
|
14
|
+
Requires-Dist: pydantic>=2.13.4
|
|
15
|
+
Requires-Dist: requests>=2.34.2
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# CLI Application
|
|
19
|
+
|
|
20
|
+
Python CLI for source extraction, detector execution, and batched output delivery.
|
|
21
|
+
|
|
22
|
+
## Setup
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
cd /unstructured/apps/cli
|
|
26
|
+
uv sync
|
|
27
|
+
# Optional if you want an activated shell instead of `uv run ...`:
|
|
28
|
+
source .venv/bin/activate
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Optional detector groups:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
uv sync --group detectors
|
|
35
|
+
# or specific groups: --group secrets --group pii --group threat ...
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Command Syntax
|
|
39
|
+
|
|
40
|
+
Use the thin wrapper:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
uv run main.py <command> <recipe.json> [options]
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Or direct module entrypoint:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
uv run python -m src.main <command> <recipe.json> [options]
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Commands:
|
|
53
|
+
|
|
54
|
+
- `test` - test source connection.
|
|
55
|
+
- `discover` - discover source resources.
|
|
56
|
+
- `extract` - run extraction and emit batched output.
|
|
57
|
+
- `sandbox` - run sandbox parsing/detectors for a local file.
|
|
58
|
+
|
|
59
|
+
## Extract Output Model
|
|
60
|
+
|
|
61
|
+
Extraction always emits in batches.
|
|
62
|
+
Recipes do not contain `output` configuration; output is controlled by CLI flags and environment variables.
|
|
63
|
+
|
|
64
|
+
Output types:
|
|
65
|
+
|
|
66
|
+
- `console` - emits NDJSON envelopes to stdout.
|
|
67
|
+
- `file` - appends NDJSON envelopes to a file.
|
|
68
|
+
- `rest` - pushes batches to API endpoints and finalizes the run.
|
|
69
|
+
|
|
70
|
+
Default behavior:
|
|
71
|
+
|
|
72
|
+
- If `source_id` is present (`--source-id` or `SOURCE_ID` env), default output is `rest`.
|
|
73
|
+
- Otherwise default output is `console`.
|
|
74
|
+
- Default batch size is `20`.
|
|
75
|
+
|
|
76
|
+
## CLI Options
|
|
77
|
+
|
|
78
|
+
Global/common:
|
|
79
|
+
|
|
80
|
+
- `--debug` - enable debug logging.
|
|
81
|
+
- `--detectors-file <path>` - sandbox only.
|
|
82
|
+
|
|
83
|
+
Extract output options:
|
|
84
|
+
|
|
85
|
+
- `--output-type rest|file|console`
|
|
86
|
+
- `--output-batch-size <int>`
|
|
87
|
+
- `--output-rest-url <url>`
|
|
88
|
+
- `--output-file-path <path>`
|
|
89
|
+
- `--source-id <uuid>`
|
|
90
|
+
- `--runner-id <uuid>`
|
|
91
|
+
- `--managed-runner` (REST only; runner lifecycle managed by API orchestrator)
|
|
92
|
+
|
|
93
|
+
Environment fallbacks:
|
|
94
|
+
|
|
95
|
+
- `SOURCE_ID`, `RUNNER_ID`
|
|
96
|
+
- `CLASSIFYRE_OUTPUT_TYPE`, `CLASSIFYRE_OUTPUT_BATCH_SIZE`
|
|
97
|
+
- `CLASSIFYRE_OUTPUT_REST_URL`, `CLASSIFYRE_OUTPUT_REST_TIMEOUT_SEC`
|
|
98
|
+
- `CLASSIFYRE_OUTPUT_FILE_PATH`
|
|
99
|
+
- `API_URL` (fallback base URL for REST output)
|
|
100
|
+
|
|
101
|
+
## Practical Examples
|
|
102
|
+
|
|
103
|
+
### 1) Console output (quick local test)
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
uv run main.py extract ./wordpress-recipe.json --output-type console --output-batch-size 1
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
You will see NDJSON lines like:
|
|
110
|
+
|
|
111
|
+
- `{"event":"batch", ...}`
|
|
112
|
+
- `{"event":"finish", ...}`
|
|
113
|
+
|
|
114
|
+
### 2) File output
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
uv run main.py extract ./wordpress-recipe.json \
|
|
118
|
+
--output-type file \
|
|
119
|
+
--output-file-path /tmp/classifyre-assets.ndjson \
|
|
120
|
+
--output-batch-size 20
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### 3) REST output (manual CLI to backend)
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
uv run main.py extract ./wordpress-recipe.json \
|
|
127
|
+
--output-type rest \
|
|
128
|
+
--source-id <source_uuid>
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Notes:
|
|
132
|
+
|
|
133
|
+
- `--runner-id` is optional for manual runs. If omitted, the CLI creates an external runner automatically.
|
|
134
|
+
- `--output-rest-url` is optional. If omitted, CLI uses `CLASSIFYRE_OUTPUT_REST_URL`, then `API_URL`, then `http://localhost:8000`.
|
|
135
|
+
- `--managed-runner` should be used only for API-orchestrated runs where the runner already exists.
|
|
136
|
+
|
|
137
|
+
### 4) REST output with explicit runner (managed/orchestrated style)
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
uv run main.py extract ./wordpress-recipe.json \
|
|
141
|
+
--output-type rest \
|
|
142
|
+
--source-id <source_uuid> \
|
|
143
|
+
--runner-id <runner_uuid> \
|
|
144
|
+
--managed-runner
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### 5) Full extract command with all output flags
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
uv run main.py extract ./wordpress-recipe.json \
|
|
151
|
+
--output-type rest \
|
|
152
|
+
--output-batch-size 20 \
|
|
153
|
+
--output-rest-url http://localhost:8000 \
|
|
154
|
+
--output-file-path /tmp/classifyre-assets.ndjson \
|
|
155
|
+
--source-id <source_uuid> \
|
|
156
|
+
--runner-id <runner_uuid> \
|
|
157
|
+
--managed-runner
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
Use `--output-file-path` only when `--output-type` is `file`.
|
|
161
|
+
|
|
162
|
+
## Dev Scripts
|
|
163
|
+
|
|
164
|
+
- `bun run dev` - run CLI quickly.
|
|
165
|
+
- `bun run lint` - ruff format/check.
|
|
166
|
+
- `bun run check-types` - mypy.
|
|
167
|
+
- `bun run test` - pytest suite.
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
src/__init__.py,sha256=a9znVQMuKTEdH6nB8kUs7jMmPV1U6IFXDzCU5RnzxFo,23
|
|
2
|
+
src/main.py,sha256=L82WFjaZk7QsXHUNyoOFyxAvHUUSyfP9978kAjgWPUA,24046
|
|
3
|
+
src/telemetry.py,sha256=fbSmuJ_t-kSthfIh3E7ESVkloH22kzl8lQKY370YtuU,3089
|
|
4
|
+
src/detectors/__init__.py,sha256=BWrUF_GtSsfs1nQMvG9UhOpUokqwJ0-5Dyg498VMs4E,3475
|
|
5
|
+
src/detectors/base.py,sha256=bXcPmc_rZG5ONWm_ZgGLT-QpLqK8zXsdAUsguMb3FMg,2998
|
|
6
|
+
src/detectors/config.py,sha256=CCrzp8Js8tQ1E34_Pl9ChY8JFjvUalXbnJJ-QWk54d4,1683
|
|
7
|
+
src/detectors/dependencies.py,sha256=C5FBnkBMWCnx34WkOrM7yTO8zcOV_zP0RbyvMtw8A20,3851
|
|
8
|
+
src/detectors/broken_links/__init__.py,sha256=maN50pAfC9IJGNUUIHGVM4YnjTAozRI43APLUCQrJcE,77
|
|
9
|
+
src/detectors/broken_links/detector.py,sha256=tyFpevrE21PZLj1GHGjlmnGq3c7Lc1OSvYcrSDkpMv8,9330
|
|
10
|
+
src/detectors/content/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
+
src/detectors/custom/__init__.py,sha256=HfTzYuQFd7XJbIAhWndqScxbbL0AS7vaOc7p3dkFW6U,296
|
|
12
|
+
src/detectors/custom/detector.py,sha256=SiDyoaRdd7W6JHnI_aGN_U0paF2Q_wenhxPiq-WoxNk,1653
|
|
13
|
+
src/detectors/custom/trainer.py,sha256=xS-eg8Ic6qr_rtMvSlrePFVwrzt9hwC56awSw6gTLOM,11358
|
|
14
|
+
src/detectors/custom/runners/__init__.py,sha256=KWSni4-O2SVfODFhRXWvml_PSg58lShHiBkaEhH8D5Y,1663
|
|
15
|
+
src/detectors/custom/runners/_base.py,sha256=VhQ2fQgJSqsrjj93aDv4_Zw-_kho6TA-pJivewGHnbk,5790
|
|
16
|
+
src/detectors/custom/runners/_factory.py,sha256=Q60Nw2vK6GX5VrrDqwmJllEar10XTgh9BgRUv4GcWi0,2093
|
|
17
|
+
src/detectors/custom/runners/_feature_extraction.py,sha256=U2-EWGR52hqMiLJ10R7Gx72D3MZXNad1w_TEbPJoRo8,5577
|
|
18
|
+
src/detectors/custom/runners/_gliner2.py,sha256=WpEpWHrIFoPkL4WGQ7XWzHekHQ6VWQsmw3Bqm_WFrWU,12680
|
|
19
|
+
src/detectors/custom/runners/_image_classification.py,sha256=43DHRwyA2hXv7UAiXVOzUGzuT8M5IGTE6T9WVuO90Ng,4000
|
|
20
|
+
src/detectors/custom/runners/_llm.py,sha256=viqeuuT_5eC5pqTXXkzhSy1zJyLAj5lzjzHBvg8PONw,731
|
|
21
|
+
src/detectors/custom/runners/_object_detection.py,sha256=C9kfeRbf9GTXu33l8ieQdn4EiGYRWbeRsy0Vq6V4MJQ,4683
|
|
22
|
+
src/detectors/custom/runners/_regex.py,sha256=lq9YA9v12iUXVoqgWWq0dHiPltbnfCBGW9fvYYiAugg,5219
|
|
23
|
+
src/detectors/custom/runners/_text_classification.py,sha256=HhXvXyWAevAn29f8rfneH7GuGFDgmrYHGOe9nf-Bpic,4633
|
|
24
|
+
src/detectors/pii/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
|
+
src/detectors/pii/detector.py,sha256=mLGBom6tLmeTW9ARKb0LDeeyz17-j98UPlBZnX02CH4,33060
|
|
26
|
+
src/detectors/secrets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
27
|
+
src/detectors/secrets/detector.py,sha256=aq8s0shOj7_fK9YcturY_OX8W8IAcZfg9Von3G1VFcw,13895
|
|
28
|
+
src/detectors/threat/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
29
|
+
src/detectors/threat/code_security_detector.py,sha256=iUWvHZLwVmhRM_WrOBgG1A_mIsdkNHv5KqcCb5Ap0JU,6900
|
|
30
|
+
src/detectors/threat/yara_detector.py,sha256=ZmxdiDnSIUgSAj0EhVUlRdlcwl4VjOM1CQTIc_fObNE,6275
|
|
31
|
+
src/models/generated_detectors.py,sha256=BfDMufATPOj5-d2AzUxdjLoxUN-IDVMTe6EwOUExoTg,42847
|
|
32
|
+
src/models/generated_input.py,sha256=LeJovpInCNnW0pyliCYUjk947qc6nDgjknsT0QoyA6k,82805
|
|
33
|
+
src/models/generated_single_asset_scan_results.py,sha256=ou4pyGt_Cqg9to_8drCuS_IaE0qGDWJKUHUSIFhZq4E,7269
|
|
34
|
+
src/outputs/__init__.py,sha256=41SESY2nC1y4zF0CTWn1dnTh9AK7qMdRxmRSN_GuWss,126
|
|
35
|
+
src/outputs/base.py,sha256=mRFTwqkaWLqyS55OkdbO49yBnSHXlaIGd_bzLyNGj9E,1556
|
|
36
|
+
src/outputs/console.py,sha256=OwPP7fXCWFML2zB4d8AxPXikanebI8CrixbgsS3CMGM,1803
|
|
37
|
+
src/outputs/factory.py,sha256=bNkBFHEsm3oOFIh5O6Gd9N1NpFQIBvO3woern8fw20g,4918
|
|
38
|
+
src/outputs/file.py,sha256=6y585mxJS04MgTcMu5KPkJt_hm9qLjnbPEzD4MgrmI4,2594
|
|
39
|
+
src/outputs/rest.py,sha256=C5VQOIocaIK9ZBqjjR4ZW6Vei8zC0uoA2rpv-v9e-RQ,8572
|
|
40
|
+
src/pipeline/__init__.py,sha256=AouA0wkqDV1AU6P3fTaQTbmxZCBoNzXZzUWnpfmnm2Q,286
|
|
41
|
+
src/pipeline/content_provider.py,sha256=v8-qW6bQ4YZ4XJgLWCGMrSQVbEbG2zHZDS985F9SKOM,854
|
|
42
|
+
src/pipeline/detector_pipeline.py,sha256=5uJ_flXC_mNnkwtF5jyjBmYfWAWN4F4eHEMMQAZ5We0,27752
|
|
43
|
+
src/pipeline/parsed_content_provider.py,sha256=B5QyqMpx_sscOOXQSVu92d_S2FqLSPKizYhwiS8jymw,1940
|
|
44
|
+
src/sandbox/__init__.py,sha256=4t_V5Wf_2CGvtVMMblRMrAhl1T8uFpFxi-JZrelaers,123
|
|
45
|
+
src/sandbox/runner.py,sha256=21L-POOKEChmOLhwyc2rGBGDpUpNVlPiG-Xqfh6ajDQ,5397
|
|
46
|
+
src/sources/__init__.py,sha256=wYksO_0IcvlGXLouWapKyqzL0jG0El4rDDGRfwa4Zqk,3350
|
|
47
|
+
src/sources/atlassian_common.py,sha256=9jqI03SgfstG-5nK5uHF8m4p6eVO263CkpCSim8h1iQ,11853
|
|
48
|
+
src/sources/base.py,sha256=wTUGBvXqHYkPOICdWTPp31g42KCxXV74Db4QGjpKwnM,10163
|
|
49
|
+
src/sources/dependencies.py,sha256=ei5P3S1CxnNN-zKlChKwzyWEeAApYggPM_YoqrSNWaU,2699
|
|
50
|
+
src/sources/recipe_normalizer.py,sha256=JmTdLsqens7_-O1YG--VeAnbphCtfgFhLJ9tJugQHcc,5573
|
|
51
|
+
src/sources/tabular_utils.py,sha256=AVpYLuNeud8WkMtdoNzrr_akV9yKN3t51ilkzOnnkGA,5387
|
|
52
|
+
src/sources/azure_blob_storage/__init__.py,sha256=EAsp7N2Vpzc_S-ownm3-ieXVWZSIVXPPNiSlDGCyenI,81
|
|
53
|
+
src/sources/azure_blob_storage/source.py,sha256=1hUKphzuyK5ga1cviQTgXQLeulYL173m8t0YgS2TOag,5384
|
|
54
|
+
src/sources/confluence/__init__.py,sha256=VnD7SbvheqWwXS0bgxo9M561UOKgjENkwngwlLHJ500,69
|
|
55
|
+
src/sources/confluence/source.py,sha256=rykXW-d8M0dUWCZjTg3fj0ut0140R5_oCbMMN3_x6jE,27381
|
|
56
|
+
src/sources/databricks/__init__.py,sha256=yZZxIF-qhpt_W_zksFQoJGBqaIA40vQCyYLFSksGRFM,69
|
|
57
|
+
src/sources/databricks/source.py,sha256=bDWjcg0Fs1vyMtyvzR3svWZyMHHS5JFuCgTiaNwOOWY,45219
|
|
58
|
+
src/sources/google_cloud_storage/__init__.py,sha256=HSVm-DeqzcajfBkJdi1bsKC4r8N6c6ZD_ZQTpAVnqE0,85
|
|
59
|
+
src/sources/google_cloud_storage/source.py,sha256=YUV5o9TY_CsRgDcma_5lugc1ltbQ1h36EtaMr6lSkl4,4454
|
|
60
|
+
src/sources/hive/__init__.py,sha256=AYWKwROWOgaMDdRTakg5g3vqQqjECKXiozADPr4MeSM,57
|
|
61
|
+
src/sources/hive/source.py,sha256=JAyrM1DBq1QWqI-L9rGlFXyN8G1Y0y0WnjhNptVEPmU,25949
|
|
62
|
+
src/sources/jira/__init__.py,sha256=p3G96i94D6ZrdHKE6bzgVtYvO8h2-Aet4zERRYdJAyA,57
|
|
63
|
+
src/sources/jira/source.py,sha256=Ogq1NiJ41k2MrjyIuakOLdxKYexXsARVfweaV7Obgiw,22027
|
|
64
|
+
src/sources/mongodb/__init__.py,sha256=IcSTuVMB3JOXAjYpwFcFvRuc2a7m6xgZ1VG5AfM4eJU,63
|
|
65
|
+
src/sources/mongodb/source.py,sha256=ydqBGxrryDtNG8n5LGxl4XmTYWDYHTQy2QcFREc-YtY,19985
|
|
66
|
+
src/sources/mssql/__init__.py,sha256=Y2WuywS2C8lIsiwbikGKQQ9S3Pbp4X5cHyXiDNz8JPI,59
|
|
67
|
+
src/sources/mssql/source.py,sha256=oTRbBRJlG4K-Em7LZ36k5ShQqYbhAvPtpXXqCbk0cOA,40106
|
|
68
|
+
src/sources/mysql/__init__.py,sha256=V0hGtrFJLjGpc3iXzt2k8fH0WzAnyuuDG_2FogcwJBE,59
|
|
69
|
+
src/sources/mysql/source.py,sha256=5RnjPBMYagQo0K9wr0tVjvhP4zBAUF2hFPzVBXg8Cpg,30257
|
|
70
|
+
src/sources/neo4j/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
71
|
+
src/sources/neo4j/source.py,sha256=c230muZyihStMA1cOiSbNdP-v--DSwQR6018XryqTvM,18266
|
|
72
|
+
src/sources/object_storage/base.py,sha256=_NJaqOyNBs_wJAp_wUcxcuekTLgKcB0xRActrztxgGM,24477
|
|
73
|
+
src/sources/oracle/__init__.py,sha256=saYMyNR-UGniyVgLYS8u1PwPO5WQ_mEuq8WqYrl30rU,61
|
|
74
|
+
src/sources/oracle/source.py,sha256=6iBoK4hLQia6sEUgIhPxGQVLwH0w-p6skL7BZo6PRYI,36949
|
|
75
|
+
src/sources/postgresql/__init__.py,sha256=lHNYhsvG9fdnrr4b2Ya1mSdDdDRImmWOUEpQSXSu9dE,69
|
|
76
|
+
src/sources/postgresql/source.py,sha256=nBCk4fXitI7-z6DKtSXRVzHcWPspY253O6olpsP8t8g,30234
|
|
77
|
+
src/sources/powerbi/__init__.py,sha256=oDQQoV94r8xHqNjlCeFOULA-8Wyzr6tNCGznFEHEHF0,63
|
|
78
|
+
src/sources/powerbi/source.py,sha256=NjKEOkm4lym2O6Tu7F87vJXwy9eMkQoMfTblPgXmqz8,28906
|
|
79
|
+
src/sources/s3_compatible_storage/README.md,sha256=ud8h0cQ9OIwb2sw_3K3rBdTz2MCe2ratSKkcOGS1Lv4,2660
|
|
80
|
+
src/sources/s3_compatible_storage/__init__.py,sha256=AH0ZNp34GXHC-RXnXk3wQg9HPgck8yTlFF4ygjixj0k,87
|
|
81
|
+
src/sources/s3_compatible_storage/source.py,sha256=Jq9USIZ4wOZgaSwsAGl9YXPLrYHQMqroMjzdk7qno28,5578
|
|
82
|
+
src/sources/servicedesk/__init__.py,sha256=LZAcPnWYPnLvH-FpqH4x_AtipFNToOOn86qaJXG5ohM,71
|
|
83
|
+
src/sources/servicedesk/source.py,sha256=zzJffdcPlz51n8tOgdKHko1bOYCeU4Hq5BIM6g6HExc,23340
|
|
84
|
+
src/sources/slack/__init__.py,sha256=k-3Or9wLb48KXjDXqQzrUc7VenJtb5SucyoFhGKdiow,59
|
|
85
|
+
src/sources/slack/source.py,sha256=9cQI8e8fd1nzcZG2oaM72PxMB2vR5_Yf33IlCQGHEXA,18029
|
|
86
|
+
src/sources/snowflake/__init__.py,sha256=NGXckaCGaD6eHCIlHs4aErhOxCc3BbNDPvdbV_xx24g,67
|
|
87
|
+
src/sources/snowflake/source.py,sha256=fFen1NiCQOcDJ2SIldgeNgU1v8fjcW4EgfRbWsXyaSw,35565
|
|
88
|
+
src/sources/tableau/__init__.py,sha256=Di1Bh9s9SYrUYg4xZhhAhl5HfoTt0B6WQB5SxVrQhbY,63
|
|
89
|
+
src/sources/tableau/source.py,sha256=F-nX_DR1EO84OZMH2zCSfZgwCmibyI613kEB4YANnUM,30025
|
|
90
|
+
src/sources/wordpress/__init__.py,sha256=T7v2TGUYiKzARpYk9bynxChY0P9GDDxpdbBz34x9g6o,67
|
|
91
|
+
src/sources/wordpress/source.py,sha256=xs3dsc8FXXDJ5725eSaPvFnx2rN_g1520OWisceMlpA,21659
|
|
92
|
+
src/utils/__init__.py,sha256=qHkwUQnAipyD7orncfDbJCaQ47mc02h90FgWy9ad_2o,25
|
|
93
|
+
src/utils/content_extraction.py,sha256=Pqm_qMID2mQH1zc8099fNl-xIaQi3hOxQPWzK3yoKac,3082
|
|
94
|
+
src/utils/file_parser.py,sha256=3LkNbnknDnaYJ8f8y2KzNiyVjSxXb6CGeFF1izqh06k,25875
|
|
95
|
+
src/utils/hashing.py,sha256=T0GL1FWtG32QJID5RN3X-TzHHkolnNK6EfADvX6sdlE,2490
|
|
96
|
+
src/utils/uv_sync.py,sha256=N-KELdr6T4rTOueXfoEEKb3-kneZP60KmAMnReihXWc,2583
|
|
97
|
+
src/utils/validation.py,sha256=ab6OzB-GeZBMr413U10jG2FDiiWXCil-_e70T0BXwFE,1694
|
|
98
|
+
classifyre_cli-0.4.2.dist-info/METADATA,sha256=lpA2EoqBNPEKdy8B43TvX-qgNAqdgVPlDEM5MdVZhvo,4221
|
|
99
|
+
classifyre_cli-0.4.2.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
100
|
+
classifyre_cli-0.4.2.dist-info/entry_points.txt,sha256=edxllyMiIFfajgCZ3dTRACXYGC34ZX1AYTezMSR5E_8,45
|
|
101
|
+
classifyre_cli-0.4.2.dist-info/RECORD,,
|
src/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""CLI source code."""
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""Detector package for identifying sensitive content."""
|
|
2
|
+
|
|
3
|
+
import importlib
|
|
4
|
+
import logging
|
|
5
|
+
import pkgutil
|
|
6
|
+
|
|
7
|
+
from ..models.generated_detectors import DetectorConfig
|
|
8
|
+
from .base import BaseDetector
|
|
9
|
+
|
|
10
|
+
__all__ = ["BaseDetector", "get_detector", "list_available_detectors"]
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
# Maps detector name -> detector class. Starts empty and is filled by
# _discover_detectors() the first time a detector is requested or listed.
_registry: dict[str, "type[BaseDetector]"] = {}
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _discover_detectors() -> None:
    """Auto-discover all BaseDetector subclasses in the detectors package.

    Walks every non-package module below this package, imports it, and
    registers each public ``BaseDetector`` subclass in ``_registry`` under its
    detector name.

    The name is taken from the class's ``detector_name`` attribute. A class
    that merely inherits the ``BaseDetector`` default is treated as unnamed
    and gets a name derived from its class name instead
    (``FooDetector`` -> ``"foo"``).

    The walk is skipped when ``_registry`` is already populated. A module that
    fails to import (or to register) is logged and skipped so that one broken
    module does not prevent the remaining detectors from being registered.
    """
    if _registry:
        # Already discovered
        return

    logger.debug("Starting detector discovery...")

    # Walk through all modules in the detectors package
    for _loader, module_name, is_pkg in pkgutil.walk_packages(__path__, __name__ + "."):
        if is_pkg:
            continue

        try:
            # Import the module
            module = importlib.import_module(module_name)

            # Find all BaseDetector subclasses in the module
            for attr_name in dir(module):
                if attr_name.startswith("_"):
                    continue
                attr = getattr(module, attr_name)

                # Check if it's a BaseDetector subclass (but not BaseDetector itself)
                if not (
                    isinstance(attr, type)
                    and issubclass(attr, BaseDetector)
                    and attr is not BaseDetector
                ):
                    continue

                # Get detector name from class attribute or derive from class name.
                # Every subclass inherits BaseDetector.detector_name, so the
                # inherited default has to count as "no name"; otherwise the
                # fallback below could never run and all unnamed detectors
                # would collide on the same key.
                detector_name = getattr(attr, "detector_name", None)
                if not detector_name or detector_name == BaseDetector.detector_name:
                    # Fallback: derive from class name
                    detector_name = attr.__name__.replace("Detector", "").lower()

                # Skip private names and names that came out empty.
                if not detector_name or str(detector_name).startswith("_"):
                    continue

                # Register the detector
                existing = _registry.get(detector_name)
                if existing is attr:
                    # Same class seen again because another module imports it;
                    # this is not a real name clash, so stay quiet.
                    continue
                if existing is not None:
                    logger.warning(
                        f"Duplicate detector name '{detector_name}' - "
                        f"ignoring {module_name}.{attr_name}"
                    )
                else:
                    _registry[detector_name] = attr
                    logger.debug(
                        f"Registered detector '{detector_name}' from {module_name}.{attr_name}"
                    )

        except Exception as e:
            # Deliberately broad: discovery is best-effort per module.
            logger.error(f"Failed to import {module_name}: {e}")
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def get_detector(detector_name: str, config: DetectorConfig | None = None) -> BaseDetector:
    """
    Build a detector instance from its registered name.

    Discovery is triggered first, so the registry is populated before the
    lookup happens.

    Args:
        detector_name: Registry key of the detector to instantiate
        config: Optional configuration handed to the detector constructor

    Returns:
        A new instance of the detector class registered under detector_name

    Raises:
        ValueError: If no detector is registered under detector_name; the
            message lists the names that are available
    """
    _discover_detectors()

    detector_cls = _registry.get(detector_name)
    if detector_cls is None:
        known_names = ", ".join(sorted(_registry.keys()))
        raise ValueError(
            f"Detector '{detector_name}' not found. Available: {known_names or 'none'}"
        )

    return detector_cls(config)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def list_available_detectors() -> list[str]:
    """
    List the names of every registered detector.

    Discovery is triggered first, so the registry is populated before it is
    read.

    Returns:
        Detector names in ascending sorted order
    """
    _discover_detectors()

    names = list(_registry)
    names.sort()
    return names
|
src/detectors/base.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""Base detector interface for all detector implementations."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from ..models.generated_detectors import DetectorConfig
|
|
7
|
+
from ..models.generated_single_asset_scan_results import DetectionResult
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BaseDetector(ABC):
    """
    Base interface for all detector implementations.

    All detectors must implement the detect() method and get_supported_content_types().
    The base class provides common functionality like redaction and metadata retrieval.
    """

    # Identifiers reported by get_metadata(). "base" is the default a subclass
    # inherits when it does not override them.
    detector_type: str = "base"
    detector_name: str = "base"

    def __init__(self, config: DetectorConfig | None = None):
        """
        Store the detector configuration.

        Args:
            config: Detector configuration; a default DetectorConfig() is
                created when omitted
        """
        self.config: DetectorConfig = config if config is not None else DetectorConfig()
        # Not read anywhere in this class; presumably a lazy-initialisation
        # flag for subclasses (confirm against the concrete detectors).
        self._initialized = False

    @abstractmethod
    async def detect(
        self, content: str | bytes, content_type: str = "text/plain"
    ) -> list[DetectionResult]:
        """
        Scan content and return findings.

        Text detectors receive ``str``; image/binary detectors receive ``bytes``.
        Implementations should return an empty list for unsupported content types.

        Args:
            content: The content to scan — text (str) or binary (bytes)
            content_type: MIME type of content (e.g., 'text/plain', 'image/jpeg')

        Returns:
            List of detection results
        """

    @abstractmethod
    def get_supported_content_types(self) -> list[str]:
        """
        Return list of supported MIME types.

        Returns:
            List of MIME type strings (e.g., ['text/plain', 'application/json'])
        """

    def get_metadata(self) -> dict[str, Any]:
        """
        Return detector metadata.

        Returns:
            Dictionary with the keys ``detector_type``, ``detector_name``,
            ``content_types`` and ``requires_gpu``
        """
        return {
            "detector_type": self.detector_type,
            "detector_name": self.detector_name,
            "content_types": self.get_supported_content_types(),
            "requires_gpu": self.requires_gpu(),
        }

    def requires_gpu(self) -> bool:
        """
        Return True if GPU is required for this detector.

        The base implementation always returns False.

        Returns:
            Boolean indicating GPU requirement
        """
        return False

    def redact(self, content: str, findings: list[DetectionResult]) -> str:
        """
        Redact sensitive content based on findings.

        Replaces each finding's matched_content with asterisks (*) of the same
        length. Matching is textual: every occurrence of the matched text in
        content is masked, not only the one at the finding's location.
        Findings without a location or without matched_content are ignored.

        Args:
            content: Original content
            findings: List of detection results

        Returns:
            Redacted content string
        """
        maskable = [
            finding
            for finding in findings
            if finding.location is not None and finding.matched_content
        ]
        # Mask the longest matches first. If a short match were masked before a
        # longer one that contains it, the longer text would no longer be found
        # and its remainder would be left in clear text.
        maskable.sort(key=lambda finding: len(finding.matched_content), reverse=True)

        redacted = content
        for finding in maskable:
            mask = "*" * len(finding.matched_content)
            redacted = redacted.replace(finding.matched_content, mask)
        return redacted
|