classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,167 @@
1
+ Metadata-Version: 2.4
2
+ Name: classifyre-cli
3
+ Version: 0.4.2
4
+ Summary: Classifyre CLI — scan and classify unstructured data sources
5
+ License: MIT
6
+ Keywords: data,ingestion,metadata,pii,secrets,unstructured
7
+ Requires-Python: >=3.12
8
+ Requires-Dist: beautifulsoup4>=4.12.0
9
+ Requires-Dist: classifyre-schemas
10
+ Requires-Dist: email-validator>=2.3.0
11
+ Requires-Dist: en-core-web-sm
12
+ Requires-Dist: jsonschema>=4.26.0
13
+ Requires-Dist: lxml>=6.1.1
14
+ Requires-Dist: pydantic>=2.13.4
15
+ Requires-Dist: requests>=2.34.2
16
+ Description-Content-Type: text/markdown
17
+
18
+ # CLI Application
19
+
20
+ Python CLI for source extraction, detector execution, and batched output delivery.
21
+
22
+ ## Setup
23
+
24
+ ```bash
25
+ cd /unstructured/apps/cli
26
+ uv sync
27
+ # Optional if you want an activated shell instead of `uv run ...`:
28
+ source .venv/bin/activate
29
+ ```
30
+
31
+ Optional detector groups:
32
+
33
+ ```bash
34
+ uv sync --group detectors
35
+ # or specific groups: --group secrets --group pii --group threat ...
36
+ ```
37
+
38
+ ## Command Syntax
39
+
40
+ Use the thin wrapper:
41
+
42
+ ```bash
43
+ uv run main.py <command> <recipe.json> [options]
44
+ ```
45
+
46
+ Or direct module entrypoint:
47
+
48
+ ```bash
49
+ uv run python -m src.main <command> <recipe.json> [options]
50
+ ```
51
+
52
+ Commands:
53
+
54
+ - `test` - test source connection.
55
+ - `discover` - discover source resources.
56
+ - `extract` - run extraction and emit batched output.
57
+ - `sandbox` - run sandbox parsing/detectors for a local file.
58
+
59
+ ## Extract Output Model
60
+
61
+ Extraction always emits in batches.
62
+ Recipes do not contain `output` configuration; output is controlled by CLI flags and environment variables.
63
+
64
+ Output types:
65
+
66
+ - `console` - emits NDJSON envelopes to stdout.
67
+ - `file` - appends NDJSON envelopes to a file.
68
+ - `rest` - pushes batches to API endpoints and finalizes run.
69
+
70
+ Default behavior:
71
+
72
+ - If `source_id` is present (`--source-id` or `SOURCE_ID` env), default output is `rest`.
73
+ - Otherwise default output is `console`.
74
+ - Default batch size is `20`.
75
+
76
+ ## CLI Options
77
+
78
+ Global/common:
79
+
80
+ - `--debug` - enable debug logging.
81
+ - `--detectors-file <path>` - sandbox only.
82
+
83
+ Extract output options:
84
+
85
+ - `--output-type rest|file|console`
86
+ - `--output-batch-size <int>`
87
+ - `--output-rest-url <url>`
88
+ - `--output-file-path <path>`
89
+ - `--source-id <uuid>`
90
+ - `--runner-id <uuid>`
91
+ - `--managed-runner` (REST only; runner lifecycle managed by API orchestrator)
92
+
93
+ Environment fallbacks:
94
+
95
+ - `SOURCE_ID`, `RUNNER_ID`
96
+ - `CLASSIFYRE_OUTPUT_TYPE`, `CLASSIFYRE_OUTPUT_BATCH_SIZE`
97
+ - `CLASSIFYRE_OUTPUT_REST_URL`, `CLASSIFYRE_OUTPUT_REST_TIMEOUT_SEC`
98
+ - `CLASSIFYRE_OUTPUT_FILE_PATH`
99
+ - `API_URL` (fallback base URL for REST output)
100
+
101
+ ## Practical Examples
102
+
103
+ ### 1) Console output (quick local test)
104
+
105
+ ```bash
106
+ uv run main.py extract ./wordpress-recipe.json --output-type console --output-batch-size 1
107
+ ```
108
+
109
+ You will see NDJSON lines like:
110
+
111
+ - `{"event":"batch", ...}`
112
+ - `{"event":"finish", ...}`
113
+
114
+ ### 2) File output
115
+
116
+ ```bash
117
+ uv run main.py extract ./wordpress-recipe.json \
118
+ --output-type file \
119
+ --output-file-path /tmp/classifyre-assets.ndjson \
120
+ --output-batch-size 20
121
+ ```
122
+
123
+ ### 3) REST output (manual CLI to backend)
124
+
125
+ ```bash
126
+ uv run main.py extract ./wordpress-recipe.json \
127
+ --output-type rest \
128
+ --source-id <source_uuid>
129
+ ```
130
+
131
+ Notes:
132
+
133
+ - `--runner-id` is optional for manual runs. If omitted, the CLI creates an external runner automatically.
134
+ - `--output-rest-url` is optional. If omitted, CLI uses `CLASSIFYRE_OUTPUT_REST_URL`, then `API_URL`, then `http://localhost:8000`.
135
+ - `--managed-runner` should be used only for API-orchestrated runs where the runner already exists.
136
+
137
+ ### 4) REST output with explicit runner (managed/orchestrated style)
138
+
139
+ ```bash
140
+ uv run main.py extract ./wordpress-recipe.json \
141
+ --output-type rest \
142
+ --source-id <source_uuid> \
143
+ --runner-id <runner_uuid> \
144
+ --managed-runner
145
+ ```
146
+
147
+ ### 5) Full extract command with all output flags
148
+
149
+ ```bash
150
+ uv run main.py extract ./wordpress-recipe.json \
151
+ --output-type rest \
152
+ --output-batch-size 20 \
153
+ --output-rest-url http://localhost:8000 \
154
+ --output-file-path /tmp/classifyre-assets.ndjson \
155
+ --source-id <source_uuid> \
156
+ --runner-id <runner_uuid> \
157
+ --managed-runner
158
+ ```
159
+
160
+ Use `--output-file-path` only when `--output-type` is `file`.
161
+
162
+ ## Dev Scripts
163
+
164
+ - `bun run dev` - run CLI quickly.
165
+ - `bun run lint` - ruff format/check.
166
+ - `bun run check-types` - mypy.
167
+ - `bun run test` - pytest suite.
@@ -0,0 +1,101 @@
1
+ src/__init__.py,sha256=a9znVQMuKTEdH6nB8kUs7jMmPV1U6IFXDzCU5RnzxFo,23
2
+ src/main.py,sha256=L82WFjaZk7QsXHUNyoOFyxAvHUUSyfP9978kAjgWPUA,24046
3
+ src/telemetry.py,sha256=fbSmuJ_t-kSthfIh3E7ESVkloH22kzl8lQKY370YtuU,3089
4
+ src/detectors/__init__.py,sha256=BWrUF_GtSsfs1nQMvG9UhOpUokqwJ0-5Dyg498VMs4E,3475
5
+ src/detectors/base.py,sha256=bXcPmc_rZG5ONWm_ZgGLT-QpLqK8zXsdAUsguMb3FMg,2998
6
+ src/detectors/config.py,sha256=CCrzp8Js8tQ1E34_Pl9ChY8JFjvUalXbnJJ-QWk54d4,1683
7
+ src/detectors/dependencies.py,sha256=C5FBnkBMWCnx34WkOrM7yTO8zcOV_zP0RbyvMtw8A20,3851
8
+ src/detectors/broken_links/__init__.py,sha256=maN50pAfC9IJGNUUIHGVM4YnjTAozRI43APLUCQrJcE,77
9
+ src/detectors/broken_links/detector.py,sha256=tyFpevrE21PZLj1GHGjlmnGq3c7Lc1OSvYcrSDkpMv8,9330
10
+ src/detectors/content/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ src/detectors/custom/__init__.py,sha256=HfTzYuQFd7XJbIAhWndqScxbbL0AS7vaOc7p3dkFW6U,296
12
+ src/detectors/custom/detector.py,sha256=SiDyoaRdd7W6JHnI_aGN_U0paF2Q_wenhxPiq-WoxNk,1653
13
+ src/detectors/custom/trainer.py,sha256=xS-eg8Ic6qr_rtMvSlrePFVwrzt9hwC56awSw6gTLOM,11358
14
+ src/detectors/custom/runners/__init__.py,sha256=KWSni4-O2SVfODFhRXWvml_PSg58lShHiBkaEhH8D5Y,1663
15
+ src/detectors/custom/runners/_base.py,sha256=VhQ2fQgJSqsrjj93aDv4_Zw-_kho6TA-pJivewGHnbk,5790
16
+ src/detectors/custom/runners/_factory.py,sha256=Q60Nw2vK6GX5VrrDqwmJllEar10XTgh9BgRUv4GcWi0,2093
17
+ src/detectors/custom/runners/_feature_extraction.py,sha256=U2-EWGR52hqMiLJ10R7Gx72D3MZXNad1w_TEbPJoRo8,5577
18
+ src/detectors/custom/runners/_gliner2.py,sha256=WpEpWHrIFoPkL4WGQ7XWzHekHQ6VWQsmw3Bqm_WFrWU,12680
19
+ src/detectors/custom/runners/_image_classification.py,sha256=43DHRwyA2hXv7UAiXVOzUGzuT8M5IGTE6T9WVuO90Ng,4000
20
+ src/detectors/custom/runners/_llm.py,sha256=viqeuuT_5eC5pqTXXkzhSy1zJyLAj5lzjzHBvg8PONw,731
21
+ src/detectors/custom/runners/_object_detection.py,sha256=C9kfeRbf9GTXu33l8ieQdn4EiGYRWbeRsy0Vq6V4MJQ,4683
22
+ src/detectors/custom/runners/_regex.py,sha256=lq9YA9v12iUXVoqgWWq0dHiPltbnfCBGW9fvYYiAugg,5219
23
+ src/detectors/custom/runners/_text_classification.py,sha256=HhXvXyWAevAn29f8rfneH7GuGFDgmrYHGOe9nf-Bpic,4633
24
+ src/detectors/pii/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
25
+ src/detectors/pii/detector.py,sha256=mLGBom6tLmeTW9ARKb0LDeeyz17-j98UPlBZnX02CH4,33060
26
+ src/detectors/secrets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
+ src/detectors/secrets/detector.py,sha256=aq8s0shOj7_fK9YcturY_OX8W8IAcZfg9Von3G1VFcw,13895
28
+ src/detectors/threat/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
29
+ src/detectors/threat/code_security_detector.py,sha256=iUWvHZLwVmhRM_WrOBgG1A_mIsdkNHv5KqcCb5Ap0JU,6900
30
+ src/detectors/threat/yara_detector.py,sha256=ZmxdiDnSIUgSAj0EhVUlRdlcwl4VjOM1CQTIc_fObNE,6275
31
+ src/models/generated_detectors.py,sha256=BfDMufATPOj5-d2AzUxdjLoxUN-IDVMTe6EwOUExoTg,42847
32
+ src/models/generated_input.py,sha256=LeJovpInCNnW0pyliCYUjk947qc6nDgjknsT0QoyA6k,82805
33
+ src/models/generated_single_asset_scan_results.py,sha256=ou4pyGt_Cqg9to_8drCuS_IaE0qGDWJKUHUSIFhZq4E,7269
34
+ src/outputs/__init__.py,sha256=41SESY2nC1y4zF0CTWn1dnTh9AK7qMdRxmRSN_GuWss,126
35
+ src/outputs/base.py,sha256=mRFTwqkaWLqyS55OkdbO49yBnSHXlaIGd_bzLyNGj9E,1556
36
+ src/outputs/console.py,sha256=OwPP7fXCWFML2zB4d8AxPXikanebI8CrixbgsS3CMGM,1803
37
+ src/outputs/factory.py,sha256=bNkBFHEsm3oOFIh5O6Gd9N1NpFQIBvO3woern8fw20g,4918
38
+ src/outputs/file.py,sha256=6y585mxJS04MgTcMu5KPkJt_hm9qLjnbPEzD4MgrmI4,2594
39
+ src/outputs/rest.py,sha256=C5VQOIocaIK9ZBqjjR4ZW6Vei8zC0uoA2rpv-v9e-RQ,8572
40
+ src/pipeline/__init__.py,sha256=AouA0wkqDV1AU6P3fTaQTbmxZCBoNzXZzUWnpfmnm2Q,286
41
+ src/pipeline/content_provider.py,sha256=v8-qW6bQ4YZ4XJgLWCGMrSQVbEbG2zHZDS985F9SKOM,854
42
+ src/pipeline/detector_pipeline.py,sha256=5uJ_flXC_mNnkwtF5jyjBmYfWAWN4F4eHEMMQAZ5We0,27752
43
+ src/pipeline/parsed_content_provider.py,sha256=B5QyqMpx_sscOOXQSVu92d_S2FqLSPKizYhwiS8jymw,1940
44
+ src/sandbox/__init__.py,sha256=4t_V5Wf_2CGvtVMMblRMrAhl1T8uFpFxi-JZrelaers,123
45
+ src/sandbox/runner.py,sha256=21L-POOKEChmOLhwyc2rGBGDpUpNVlPiG-Xqfh6ajDQ,5397
46
+ src/sources/__init__.py,sha256=wYksO_0IcvlGXLouWapKyqzL0jG0El4rDDGRfwa4Zqk,3350
47
+ src/sources/atlassian_common.py,sha256=9jqI03SgfstG-5nK5uHF8m4p6eVO263CkpCSim8h1iQ,11853
48
+ src/sources/base.py,sha256=wTUGBvXqHYkPOICdWTPp31g42KCxXV74Db4QGjpKwnM,10163
49
+ src/sources/dependencies.py,sha256=ei5P3S1CxnNN-zKlChKwzyWEeAApYggPM_YoqrSNWaU,2699
50
+ src/sources/recipe_normalizer.py,sha256=JmTdLsqens7_-O1YG--VeAnbphCtfgFhLJ9tJugQHcc,5573
51
+ src/sources/tabular_utils.py,sha256=AVpYLuNeud8WkMtdoNzrr_akV9yKN3t51ilkzOnnkGA,5387
52
+ src/sources/azure_blob_storage/__init__.py,sha256=EAsp7N2Vpzc_S-ownm3-ieXVWZSIVXPPNiSlDGCyenI,81
53
+ src/sources/azure_blob_storage/source.py,sha256=1hUKphzuyK5ga1cviQTgXQLeulYL173m8t0YgS2TOag,5384
54
+ src/sources/confluence/__init__.py,sha256=VnD7SbvheqWwXS0bgxo9M561UOKgjENkwngwlLHJ500,69
55
+ src/sources/confluence/source.py,sha256=rykXW-d8M0dUWCZjTg3fj0ut0140R5_oCbMMN3_x6jE,27381
56
+ src/sources/databricks/__init__.py,sha256=yZZxIF-qhpt_W_zksFQoJGBqaIA40vQCyYLFSksGRFM,69
57
+ src/sources/databricks/source.py,sha256=bDWjcg0Fs1vyMtyvzR3svWZyMHHS5JFuCgTiaNwOOWY,45219
58
+ src/sources/google_cloud_storage/__init__.py,sha256=HSVm-DeqzcajfBkJdi1bsKC4r8N6c6ZD_ZQTpAVnqE0,85
59
+ src/sources/google_cloud_storage/source.py,sha256=YUV5o9TY_CsRgDcma_5lugc1ltbQ1h36EtaMr6lSkl4,4454
60
+ src/sources/hive/__init__.py,sha256=AYWKwROWOgaMDdRTakg5g3vqQqjECKXiozADPr4MeSM,57
61
+ src/sources/hive/source.py,sha256=JAyrM1DBq1QWqI-L9rGlFXyN8G1Y0y0WnjhNptVEPmU,25949
62
+ src/sources/jira/__init__.py,sha256=p3G96i94D6ZrdHKE6bzgVtYvO8h2-Aet4zERRYdJAyA,57
63
+ src/sources/jira/source.py,sha256=Ogq1NiJ41k2MrjyIuakOLdxKYexXsARVfweaV7Obgiw,22027
64
+ src/sources/mongodb/__init__.py,sha256=IcSTuVMB3JOXAjYpwFcFvRuc2a7m6xgZ1VG5AfM4eJU,63
65
+ src/sources/mongodb/source.py,sha256=ydqBGxrryDtNG8n5LGxl4XmTYWDYHTQy2QcFREc-YtY,19985
66
+ src/sources/mssql/__init__.py,sha256=Y2WuywS2C8lIsiwbikGKQQ9S3Pbp4X5cHyXiDNz8JPI,59
67
+ src/sources/mssql/source.py,sha256=oTRbBRJlG4K-Em7LZ36k5ShQqYbhAvPtpXXqCbk0cOA,40106
68
+ src/sources/mysql/__init__.py,sha256=V0hGtrFJLjGpc3iXzt2k8fH0WzAnyuuDG_2FogcwJBE,59
69
+ src/sources/mysql/source.py,sha256=5RnjPBMYagQo0K9wr0tVjvhP4zBAUF2hFPzVBXg8Cpg,30257
70
+ src/sources/neo4j/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
71
+ src/sources/neo4j/source.py,sha256=c230muZyihStMA1cOiSbNdP-v--DSwQR6018XryqTvM,18266
72
+ src/sources/object_storage/base.py,sha256=_NJaqOyNBs_wJAp_wUcxcuekTLgKcB0xRActrztxgGM,24477
73
+ src/sources/oracle/__init__.py,sha256=saYMyNR-UGniyVgLYS8u1PwPO5WQ_mEuq8WqYrl30rU,61
74
+ src/sources/oracle/source.py,sha256=6iBoK4hLQia6sEUgIhPxGQVLwH0w-p6skL7BZo6PRYI,36949
75
+ src/sources/postgresql/__init__.py,sha256=lHNYhsvG9fdnrr4b2Ya1mSdDdDRImmWOUEpQSXSu9dE,69
76
+ src/sources/postgresql/source.py,sha256=nBCk4fXitI7-z6DKtSXRVzHcWPspY253O6olpsP8t8g,30234
77
+ src/sources/powerbi/__init__.py,sha256=oDQQoV94r8xHqNjlCeFOULA-8Wyzr6tNCGznFEHEHF0,63
78
+ src/sources/powerbi/source.py,sha256=NjKEOkm4lym2O6Tu7F87vJXwy9eMkQoMfTblPgXmqz8,28906
79
+ src/sources/s3_compatible_storage/README.md,sha256=ud8h0cQ9OIwb2sw_3K3rBdTz2MCe2ratSKkcOGS1Lv4,2660
80
+ src/sources/s3_compatible_storage/__init__.py,sha256=AH0ZNp34GXHC-RXnXk3wQg9HPgck8yTlFF4ygjixj0k,87
81
+ src/sources/s3_compatible_storage/source.py,sha256=Jq9USIZ4wOZgaSwsAGl9YXPLrYHQMqroMjzdk7qno28,5578
82
+ src/sources/servicedesk/__init__.py,sha256=LZAcPnWYPnLvH-FpqH4x_AtipFNToOOn86qaJXG5ohM,71
83
+ src/sources/servicedesk/source.py,sha256=zzJffdcPlz51n8tOgdKHko1bOYCeU4Hq5BIM6g6HExc,23340
84
+ src/sources/slack/__init__.py,sha256=k-3Or9wLb48KXjDXqQzrUc7VenJtb5SucyoFhGKdiow,59
85
+ src/sources/slack/source.py,sha256=9cQI8e8fd1nzcZG2oaM72PxMB2vR5_Yf33IlCQGHEXA,18029
86
+ src/sources/snowflake/__init__.py,sha256=NGXckaCGaD6eHCIlHs4aErhOxCc3BbNDPvdbV_xx24g,67
87
+ src/sources/snowflake/source.py,sha256=fFen1NiCQOcDJ2SIldgeNgU1v8fjcW4EgfRbWsXyaSw,35565
88
+ src/sources/tableau/__init__.py,sha256=Di1Bh9s9SYrUYg4xZhhAhl5HfoTt0B6WQB5SxVrQhbY,63
89
+ src/sources/tableau/source.py,sha256=F-nX_DR1EO84OZMH2zCSfZgwCmibyI613kEB4YANnUM,30025
90
+ src/sources/wordpress/__init__.py,sha256=T7v2TGUYiKzARpYk9bynxChY0P9GDDxpdbBz34x9g6o,67
91
+ src/sources/wordpress/source.py,sha256=xs3dsc8FXXDJ5725eSaPvFnx2rN_g1520OWisceMlpA,21659
92
+ src/utils/__init__.py,sha256=qHkwUQnAipyD7orncfDbJCaQ47mc02h90FgWy9ad_2o,25
93
+ src/utils/content_extraction.py,sha256=Pqm_qMID2mQH1zc8099fNl-xIaQi3hOxQPWzK3yoKac,3082
94
+ src/utils/file_parser.py,sha256=3LkNbnknDnaYJ8f8y2KzNiyVjSxXb6CGeFF1izqh06k,25875
95
+ src/utils/hashing.py,sha256=T0GL1FWtG32QJID5RN3X-TzHHkolnNK6EfADvX6sdlE,2490
96
+ src/utils/uv_sync.py,sha256=N-KELdr6T4rTOueXfoEEKb3-kneZP60KmAMnReihXWc,2583
97
+ src/utils/validation.py,sha256=ab6OzB-GeZBMr413U10jG2FDiiWXCil-_e70T0BXwFE,1694
98
+ classifyre_cli-0.4.2.dist-info/METADATA,sha256=lpA2EoqBNPEKdy8B43TvX-qgNAqdgVPlDEM5MdVZhvo,4221
99
+ classifyre_cli-0.4.2.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
100
+ classifyre_cli-0.4.2.dist-info/entry_points.txt,sha256=edxllyMiIFfajgCZ3dTRACXYGC34ZX1AYTezMSR5E_8,45
101
+ classifyre_cli-0.4.2.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ classifyre = src.main:main
src/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """CLI source code."""
@@ -0,0 +1,105 @@
1
+ """Detector package for identifying sensitive content."""
2
+
3
+ import importlib
4
+ import logging
5
+ import pkgutil
6
+
7
+ from ..models.generated_detectors import DetectorConfig
8
+ from .base import BaseDetector
9
+
10
+ __all__ = ["BaseDetector", "get_detector", "list_available_detectors"]
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ _registry: dict[str, "type[BaseDetector]"] = {}
15
+
16
+
17
def _discover_detectors() -> None:
    """Populate the module-level registry with the detector classes in this package.

    Walks every non-package module below the detectors package, imports it, and
    registers each public ``BaseDetector`` subclass under its ``detector_name``
    attribute. When that attribute is empty, the name is derived from the class
    name by removing "Detector" and lower-casing the rest. Attributes and
    detector names that start with an underscore are skipped.

    The function returns immediately when the registry already has entries.
    A module that fails while being imported or inspected is logged and
    skipped so that one broken detector does not prevent the others from
    being registered.
    """
    if _registry:
        # A non-empty registry means discovery has already run.
        return

    logger.debug("Starting detector discovery...")

    for _loader, module_name, is_pkg in pkgutil.walk_packages(__path__, __name__ + "."):
        if is_pkg:
            continue

        try:
            module = importlib.import_module(module_name)

            for attr_name in dir(module):
                if attr_name.startswith("_"):
                    continue
                attr = getattr(module, attr_name)

                # Only concrete subclasses are candidates, never BaseDetector itself.
                is_detector_class = (
                    isinstance(attr, type)
                    and issubclass(attr, BaseDetector)
                    and attr is not BaseDetector
                )
                if not is_detector_class:
                    continue

                detector_name = getattr(attr, "detector_name", None)
                if not detector_name:
                    # Fallback: derive the name from the class name.
                    detector_name = attr.__name__.replace("Detector", "").lower()
                if str(detector_name).startswith("_"):
                    continue

                registered = _registry.get(detector_name)
                if registered is attr:
                    # dir() also lists classes a module merely imported, so the
                    # same class can show up in several modules. That is a
                    # re-export, not a naming conflict; do not warn about it.
                    continue

                if registered is not None:
                    logger.warning(
                        f"Duplicate detector name '{detector_name}' - "
                        f"ignoring {module_name}.{attr_name}"
                    )
                else:
                    _registry[detector_name] = attr
                    logger.debug(
                        f"Registered detector '{detector_name}' from {module_name}.{attr_name}"
                    )

        except Exception as e:
            # Deliberate best-effort boundary: log and move on to the next module.
            logger.error(f"Failed to import {module_name}: {e}")
68
+
69
+
70
def get_detector(detector_name: str, config: DetectorConfig | None = None) -> BaseDetector:
    """
    Build an instance of the detector registered under ``detector_name``.

    Discovery is triggered first so the registry is filled before the lookup.

    Args:
        detector_name: Registry name of the detector to instantiate
        config: Optional configuration passed to the detector constructor

    Returns:
        A new instance of the matching detector class

    Raises:
        ValueError: If no detector is registered under ``detector_name``
    """
    _discover_detectors()

    detector_cls = _registry.get(detector_name)
    if detector_cls is None:
        # List what is registered so a typo is easy to spot.
        known_names = ", ".join(sorted(_registry))
        raise ValueError(
            f"Detector '{detector_name}' not found. Available: {known_names or 'none'}"
        )

    return detector_cls(config)
93
+
94
+
95
def list_available_detectors() -> list[str]:
    """
    List the names of every registered detector.

    Discovery is triggered first so the registry is filled before it is read.

    Returns:
        Detector names in sorted order
    """
    _discover_detectors()
    return sorted(_registry)
src/detectors/base.py ADDED
@@ -0,0 +1,97 @@
1
+ """Base detector interface for all detector implementations."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Any
5
+
6
+ from ..models.generated_detectors import DetectorConfig
7
+ from ..models.generated_single_asset_scan_results import DetectionResult
8
+
9
+
10
class BaseDetector(ABC):
    """
    Base interface for all detector implementations.

    Subclasses must implement ``detect()`` and ``get_supported_content_types()``.
    The base class supplies shared helpers: metadata retrieval, a GPU-requirement
    flag, and redaction of matched content.
    """

    # Identifiers reported by get_metadata(); subclasses are expected to override them.
    detector_type: str = "base"
    detector_name: str = "base"

    def __init__(self, config: DetectorConfig | None = None):
        """
        Store the detector configuration.

        Args:
            config: Detector configuration; a default ``DetectorConfig()`` is
                created when omitted
        """
        self.config: DetectorConfig = config if config is not None else DetectorConfig()
        # Starts as False; nothing in this class sets it to True.
        # NOTE(review): presumably subclasses flip it after lazy setup - confirm.
        self._initialized = False

    @abstractmethod
    async def detect(
        self, content: str | bytes, content_type: str = "text/plain"
    ) -> list[DetectionResult]:
        """
        Scan content and return findings.

        Text detectors receive ``str``; image/binary detectors receive ``bytes``.
        Implementations should return an empty list for unsupported content types.

        Args:
            content: The content to scan - text (str) or binary (bytes)
            content_type: MIME type of content (e.g., 'text/plain', 'image/jpeg')

        Returns:
            List of detection results
        """

    @abstractmethod
    def get_supported_content_types(self) -> list[str]:
        """
        Return list of supported MIME types.

        Returns:
            List of MIME type strings (e.g., ['text/plain', 'application/json'])
        """

    def get_metadata(self) -> dict[str, Any]:
        """
        Return detector metadata.

        Returns:
            Dictionary with the keys ``detector_type``, ``detector_name``,
            ``content_types`` and ``requires_gpu``
        """
        return {
            "detector_type": self.detector_type,
            "detector_name": self.detector_name,
            "content_types": self.get_supported_content_types(),
            "requires_gpu": self.requires_gpu(),
        }

    def requires_gpu(self) -> bool:
        """
        Return True if GPU is required for this detector.

        Returns:
            Boolean indicating GPU requirement; the base implementation
            always returns False
        """
        return False

    def redact(self, content: str, findings: list[DetectionResult]) -> str:
        """
        Redact sensitive content based on findings.

        Every occurrence of a finding's ``matched_content`` is replaced with
        asterisks (*) of the same length. Findings without a location or
        without matched content are ignored.

        Longer matches are masked first. Otherwise a short match that is a
        substring of a longer one would be masked first, the longer match
        would no longer be found, and the rest of it would stay readable.

        Args:
            content: Original content
            findings: List of detection results

        Returns:
            Redacted content string
        """
        maskable = [
            finding.matched_content
            for finding in findings
            if finding.location is not None and finding.matched_content
        ]

        redacted = content
        # sorted() is stable, so matches of equal length keep their original order.
        for matched in sorted(maskable, key=len, reverse=True):
            redacted = redacted.replace(matched, "*" * len(matched))
        return redacted
@@ -0,0 +1,3 @@
1
+ from .detector import BrokenLinksDetector
2
+
3
+ __all__ = ["BrokenLinksDetector"]