classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,1296 @@
1
+ # generated by datamodel-codegen:
2
+ # filename: all_detectors.json
3
+
4
+ from __future__ import annotations
5
+
6
+ from enum import StrEnum
7
+ from typing import Any, Literal
8
+
9
+ from pydantic import BaseModel, ConfigDict, Field, RootModel
10
+
11
+
12
+ class DetectorType(StrEnum):
13
+ """
14
+ Type of detector for content analysis
15
+ """
16
+
17
+ SECRETS = 'SECRETS'
18
+ PII = 'PII'
19
+ YARA = 'YARA'
20
+ BROKEN_LINKS = 'BROKEN_LINKS'
21
+ CODE_SECURITY = 'CODE_SECURITY'
22
+ CUSTOM = 'CUSTOM'
23
+
24
+
25
+ class DetectorCategory(StrEnum):
26
+ """
27
+ High-level detector category for filtering and governance
28
+ """
29
+
30
+ SECURITY = 'SECURITY'
31
+ PRIVACY = 'PRIVACY'
32
+ THREAT = 'THREAT'
33
+ CONTENT = 'CONTENT'
34
+ QUALITY = 'QUALITY'
35
+ FAIRNESS = 'FAIRNESS'
36
+ COMPLIANCE = 'COMPLIANCE'
37
+ CLASSIFICATION = 'CLASSIFICATION'
38
+
39
+
40
+ class SupportedAssetType(StrEnum):
41
+ """
42
+ Canonical output asset type the detector can process
43
+ """
44
+
45
+ TXT = 'TXT'
46
+ TABLE = 'TABLE'
47
+ IMAGE = 'IMAGE'
48
+ VIDEO = 'VIDEO'
49
+ AUDIO = 'AUDIO'
50
+ URL = 'URL'
51
+ BINARY = 'BINARY'
52
+ OTHER = 'OTHER'
53
+
54
+
55
+ class SourceAssetType(StrEnum):
56
+ """
57
+ Source types where detector execution is supported
58
+ """
59
+
60
+ WORDPRESS = 'WORDPRESS'
61
+ SLACK = 'SLACK'
62
+ S3_COMPATIBLE_STORAGE = 'S3_COMPATIBLE_STORAGE'
63
+ AZURE_BLOB_STORAGE = 'AZURE_BLOB_STORAGE'
64
+ GOOGLE_CLOUD_STORAGE = 'GOOGLE_CLOUD_STORAGE'
65
+ POSTGRESQL = 'POSTGRESQL'
66
+ MYSQL = 'MYSQL'
67
+ JIRA = 'JIRA'
68
+ CONFLUENCE = 'CONFLUENCE'
69
+ SERVICEDESK = 'SERVICEDESK'
70
+ DATABRICKS = 'DATABRICKS'
71
+ SNOWFLAKE = 'SNOWFLAKE'
72
+ MONGODB = 'MONGODB'
73
+ TABLEAU = 'TABLEAU'
74
+ MSSQL = 'MSSQL'
75
+ ORACLE = 'ORACLE'
76
+ HIVE = 'HIVE'
77
+ POWERBI = 'POWERBI'
78
+
79
+
80
+ class DetectorLifecycleStatus(StrEnum):
81
+ """
82
+ Implementation lifecycle status of a detector
83
+ """
84
+
85
+ active = 'active'
86
+ planned = 'planned'
87
+ experimental = 'experimental'
88
+ deprecated = 'deprecated'
89
+
90
+
91
+ class DetectorPriority(StrEnum):
92
+ """
93
+ Roadmap priority for rollout
94
+ """
95
+
96
+ P0 = 'P0'
97
+ P1 = 'P1'
98
+ P2 = 'P2'
99
+ P3 = 'P3'
100
+ P4 = 'P4'
101
+
102
+
103
class DetectorCatalogEntry(BaseModel):
    """
    Catalog metadata for one detector type
    """

    # Required fields: identity, lifecycle, priority, and at least one
    # category and one supported asset type.
    detector_type: DetectorType = Field(description='Canonical detector identifier')
    lifecycle_status: DetectorLifecycleStatus
    priority: DetectorPriority
    categories: list[DetectorCategory] = Field(
        min_length=1,
        description='Detector categories used for routing, reporting, and policy',
    )
    supported_asset_types: list[SupportedAssetType] = Field(
        min_length=1,
        description='Asset modalities the detector can scan',
    )

    # Optional fields: all default to None when omitted.
    supported_source_types: list[SourceAssetType] | None = Field(
        default=None,
        description='Optional source-level compatibility list',
    )
    supported_mime_types: list[str] | None = Field(
        default=None,
        description='Optional MIME type compatibility list',
    )
    recommended_model: str | None = Field(
        default=None,
        description='Recommended model, engine, or package for this detector',
    )
    notes: str | None = Field(
        default=None,
        description='Implementation notes and constraints',
    )
131
+
132
+
133
# Built-in catalog rows used as the default value of ``DetectorCatalog.root``.
# They are plain dicts; the field sets ``validate_default=True`` so they are
# validated into ``DetectorCatalogEntry`` objects.
_DEFAULT_DETECTOR_CATALOG = [
    {
        'detector_type': 'SECRETS',
        'lifecycle_status': 'active',
        'priority': 'P0',
        'categories': ['SECURITY', 'COMPLIANCE'],
        'supported_asset_types': ['TXT', 'TABLE', 'URL'],
        'recommended_model': 'detect-secrets',
        'notes': 'Detects confidential credentials like API keys, tokens, or passwords that could lead to security breaches.',
    },
    {
        'detector_type': 'PII',
        'lifecycle_status': 'active',
        'priority': 'P0',
        'categories': ['PRIVACY', 'COMPLIANCE'],
        'supported_asset_types': ['TXT', 'TABLE', 'URL'],
        'recommended_model': 'presidio-analyzer',
        'notes': 'Identifies personal data (e.g., names, emails, IDs) that must be protected for privacy and compliance.',
    },
    {
        'detector_type': 'YARA',
        'lifecycle_status': 'active',
        'priority': 'P1',
        'categories': ['THREAT', 'SECURITY'],
        'supported_asset_types': ['TXT', 'TABLE', 'URL', 'BINARY'],
        'recommended_model': 'yara-python',
        'notes': 'Uses security rules to identify known malware patterns or suspicious file content.',
    },
    {
        'detector_type': 'BROKEN_LINKS',
        'lifecycle_status': 'active',
        'priority': 'P2',
        'categories': ['QUALITY'],
        'supported_asset_types': ['TXT', 'TABLE', 'URL'],
        'recommended_model': 'HTTP validation engine',
        'notes': 'Finds non-working or invalid links that reduce content quality and user trust.',
    },
    {
        'detector_type': 'CODE_SECURITY',
        'lifecycle_status': 'active',
        'priority': 'P3',
        'categories': ['SECURITY', 'THREAT'],
        'supported_asset_types': ['TXT', 'TABLE', 'OTHER'],
        'recommended_model': 'bandit',
        'notes': 'Identifies vulnerabilities or insecure patterns in source code (e.g., hardcoded secrets).',
    },
    {
        'detector_type': 'CUSTOM',
        'lifecycle_status': 'active',
        'priority': 'P0',
        'categories': ['CLASSIFICATION', 'COMPLIANCE'],
        'supported_asset_types': ['TXT', 'TABLE', 'URL', 'IMAGE'],
        'recommended_model': 'mDeBERTa-v3 + SetFit + GLiNER + HuggingFace transformers',
        'notes': 'User-defined rules and pipelines tailored to specific business needs. Supports regex, GLiNER2, LLM, text classification, image classification, feature extraction, and object detection pipelines.',
    },
]


class DetectorCatalog(RootModel[list[DetectorCatalogEntry]]):
    """
    Detector capability catalog used for planning and runtime routing
    """

    root: list[DetectorCatalogEntry] = Field(
        default=_DEFAULT_DETECTOR_CATALOG,
        description='Detector capability catalog used for planning and runtime routing',
        validate_default=True,
    )
198
+
199
+
200
+ class Severity(StrEnum):
201
+ """
202
+ Severity level of finding
203
+ """
204
+
205
+ critical = 'critical'
206
+ high = 'high'
207
+ medium = 'medium'
208
+ low = 'low'
209
+ info = 'info'
210
+
211
+
212
class Location(BaseModel):
    """
    Location of finding in content
    """

    # Optional 1-indexed line/column position.
    line: int | None = Field(default=None, description='Line number (1-indexed)')
    column: int | None = Field(default=None, description='Column number (1-indexed)')
    # Required 0-indexed offsets.
    start: int = Field(description='Start offset (0-indexed)')
    end: int = Field(description='End offset (0-indexed)')
    path: str | None = Field(default=None, description='File path or identifier')
222
+
223
+
224
+ class SecretsEnabledPattern(StrEnum):
225
+ """
226
+ Secrets detector pattern types. Each value maps to a detect-secrets plugin: artifactory=ArtifactoryDetector, aws=AWSKeyDetector, azure_storage=AzureStorageKeyDetector, basic_auth=BasicAuthDetector, cloudant=CloudantDetector, discord=DiscordBotTokenDetector, github=GitHubTokenDetector, gitlab=GitLabTokenDetector, high_entropy_base64=Base64HighEntropyString, high_entropy_hex=HexHighEntropyString, ibm_cloud_iam=IbmCloudIamDetector, ibm_cos_hmac=IbmCosHmacDetector, ip_public=IPPublicDetector, jwt=JwtTokenDetector, keyword=KeywordDetector, mailchimp=MailchimpDetector, npm=NpmDetector, openai=OpenAIDetector, private_key=PrivateKeyDetector, pypi=PypiTokenDetector, sendgrid=SendGridDetector, slack=SlackDetector, softlayer=SoftlayerDetector, square_oauth=SquareOAuthDetector, stripe=StripeDetector, telegram=TelegramBotTokenDetector, twilio=TwilioKeyDetector.
227
+ """
228
+
229
+ artifactory = 'artifactory'
230
+ aws = 'aws'
231
+ azure_storage = 'azure_storage'
232
+ basic_auth = 'basic_auth'
233
+ cloudant = 'cloudant'
234
+ discord = 'discord'
235
+ github = 'github'
236
+ gitlab = 'gitlab'
237
+ high_entropy_base64 = 'high_entropy_base64'
238
+ high_entropy_hex = 'high_entropy_hex'
239
+ ibm_cloud_iam = 'ibm_cloud_iam'
240
+ ibm_cos_hmac = 'ibm_cos_hmac'
241
+ ip_public = 'ip_public'
242
+ jwt = 'jwt'
243
+ keyword = 'keyword'
244
+ mailchimp = 'mailchimp'
245
+ npm = 'npm'
246
+ openai = 'openai'
247
+ private_key = 'private_key'
248
+ pypi = 'pypi'
249
+ sendgrid = 'sendgrid'
250
+ slack = 'slack'
251
+ softlayer = 'softlayer'
252
+ square_oauth = 'square_oauth'
253
+ stripe = 'stripe'
254
+ telegram = 'telegram'
255
+ twilio = 'twilio'
256
+
257
+
258
+ class PIIEnabledPattern(StrEnum):
259
+ """
260
+ Presidio entity types for PII detection. Global: CREDIT_CARD, CRYPTO, DATE_TIME, EMAIL_ADDRESS, IBAN_CODE, IP_ADDRESS, NRP, LOCATION, PERSON, PHONE_NUMBER, MEDICAL_LICENSE, URL. USA: US_BANK_NUMBER, US_DRIVER_LICENSE, US_ITIN, US_PASSPORT, US_SSN. UK: UK_NHS. Spain: ES_NIF, ES_NIE. Italy: IT_FISCAL_CODE, IT_DRIVER_LICENSE, IT_VAR_CODE, IT_PASSPORT, IT_IDENTITY_CARD. Singapore: SG_NRIC_FIN, SG_UEN. Australia: AU_ABN, AU_ACN, AU_TFN, AU_MEDICARE. India: IN_PAN, IN_AADHAAR, IN_VEHICLE_REGISTRATION, IN_VOTER. Finland: FI_PERSONAL_IDENTITY_CODE. Poland: PL_PESEL. DACH: AT_SVNR, CH_AHV, DE_TAX_ID, EU_NATIONAL_ID.
261
+ """
262
+
263
+ CREDIT_CARD = 'CREDIT_CARD'
264
+ CRYPTO = 'CRYPTO'
265
+ DATE_TIME = 'DATE_TIME'
266
+ EMAIL_ADDRESS = 'EMAIL_ADDRESS'
267
+ IBAN_CODE = 'IBAN_CODE'
268
+ IP_ADDRESS = 'IP_ADDRESS'
269
+ NRP = 'NRP'
270
+ LOCATION = 'LOCATION'
271
+ PERSON = 'PERSON'
272
+ PHONE_NUMBER = 'PHONE_NUMBER'
273
+ MEDICAL_LICENSE = 'MEDICAL_LICENSE'
274
+ URL = 'URL'
275
+ US_BANK_NUMBER = 'US_BANK_NUMBER'
276
+ US_DRIVER_LICENSE = 'US_DRIVER_LICENSE'
277
+ US_ITIN = 'US_ITIN'
278
+ US_PASSPORT = 'US_PASSPORT'
279
+ US_SSN = 'US_SSN'
280
+ UK_NHS = 'UK_NHS'
281
+ ES_NIF = 'ES_NIF'
282
+ ES_NIE = 'ES_NIE'
283
+ IT_FISCAL_CODE = 'IT_FISCAL_CODE'
284
+ IT_DRIVER_LICENSE = 'IT_DRIVER_LICENSE'
285
+ IT_VAR_CODE = 'IT_VAR_CODE'
286
+ IT_PASSPORT = 'IT_PASSPORT'
287
+ IT_IDENTITY_CARD = 'IT_IDENTITY_CARD'
288
+ SG_NRIC_FIN = 'SG_NRIC_FIN'
289
+ SG_UEN = 'SG_UEN'
290
+ AU_ABN = 'AU_ABN'
291
+ AU_ACN = 'AU_ACN'
292
+ AU_TFN = 'AU_TFN'
293
+ AU_MEDICARE = 'AU_MEDICARE'
294
+ IN_PAN = 'IN_PAN'
295
+ IN_AADHAAR = 'IN_AADHAAR'
296
+ IN_VEHICLE_REGISTRATION = 'IN_VEHICLE_REGISTRATION'
297
+ IN_VOTER = 'IN_VOTER'
298
+ FI_PERSONAL_IDENTITY_CODE = 'FI_PERSONAL_IDENTITY_CODE'
299
+ PL_PESEL = 'PL_PESEL'
300
+ AT_SVNR = 'AT_SVNR'
301
+ CH_AHV = 'CH_AHV'
302
+ DE_TAX_ID = 'DE_TAX_ID'
303
+ EU_NATIONAL_ID = 'EU_NATIONAL_ID'
304
+
305
+
306
class PIIRecognizerPattern(BaseModel):
    """
    Regex pattern entry for a custom Presidio recognizer
    """

    model_config = ConfigDict(extra='forbid')

    # All three fields are required.
    name: str = Field(description='Human-readable name for this pattern')
    regex: str = Field(description='Regular expression to match the entity')
    # Score is bounded to the closed interval [0.0, 1.0].
    score: float = Field(
        ge=0.0,
        le=1.0,
        description='Confidence score assigned when this pattern matches (0-1)',
    )
322
+
323
+
324
class Patterns(RootModel[list[PIIRecognizerPattern]]):
    # Root wrapper for a list of regex pattern entries with at least one item.
    # NOTE(review): the None default does not match the non-optional
    # annotation; presumably a code-generator artefact of a nullable schema —
    # confirm before relying on ``Patterns()`` with no argument.
    root: list[PIIRecognizerPattern] = Field(
        default=None,
        description='Regex patterns for this recognizer',
        min_length=1,
    )
328
+
329
+
330
class DenyList(RootModel[list[str]]):
    # Root wrapper for a list of deny-list terms with at least one item.
    # NOTE(review): the None default does not match the non-optional
    # annotation; presumably a code-generator artefact of a nullable schema —
    # confirm before relying on ``DenyList()`` with no argument.
    root: list[str] = Field(
        default=None,
        description='Exact-match deny-list terms for this recognizer',
        min_length=1,
    )
336
+
337
+
338
class PIICustomRecognizer(BaseModel):
    """
    Ad-hoc Presidio recognizer added at runtime. Supports regex patterns, deny-list terms, or both. The recognizer is registered in the analyzer engine and applies to all scans with this config.
    """

    model_config = ConfigDict(extra='forbid')

    # Required identity of the recognizer and the entity label it emits.
    name: str = Field(description='Unique name for this recognizer')
    supported_entity: str = Field(
        description='Entity label produced when this recognizer fires (e.g. MY_EMPLOYEE_ID)',
    )
    supported_language: str | None = Field(
        default='en',
        description='BCP-47 language code this recognizer applies to',
    )
    # Matching sources: regex patterns and/or deny-list terms, both optional
    # at the schema level.
    patterns: Patterns | None = Field(
        default=None,
        description='Regex patterns for this recognizer',
    )
    deny_list: DenyList | None = Field(
        default=None,
        description='Exact-match deny-list terms for this recognizer',
    )
    context: list[str] | None = Field(
        default=None,
        description="Context words that boost the score when found near a match (e.g. ['zip', 'code'])",
    )
364
+
365
+
366
class DetectorConfig(BaseModel):
    """
    Base configuration for detector
    """

    # Declares no fields of its own; the detector-specific config models in
    # this module subclass it.
370
+
371
+
372
class EnabledPatterns(RootModel[list[SecretsEnabledPattern]]):
    # Root wrapper for a list of secrets pattern types with at least one item.
    # NOTE(review): the None default does not match the non-optional
    # annotation; presumably a code-generator artefact of a nullable schema —
    # confirm before relying on ``EnabledPatterns()`` with no argument.
    root: list[SecretsEnabledPattern] = Field(
        default=None,
        description='Subset of detect-secrets plugins to enable. When null all supported plugins are active.',
        min_length=1,
    )
378
+
379
+
380
class EntropyLimitBase64(RootModel[float]):
    # Root wrapper for a float bounded to the closed interval [0.0, 8.0].
    # NOTE(review): the None default does not match the ``float`` annotation;
    # presumably a code-generator artefact of a nullable schema — confirm.
    root: float = Field(
        default=None,
        description='Entropy threshold for Base64HighEntropyString (0-8). Defaults to detect-secrets built-in of 4.5 when null. Lower values catch more secrets but increase false positives.',
        ge=0.0,
        le=8.0,
    )
387
+
388
+
389
class EntropyLimitHex(RootModel[float]):
    # Root wrapper for a float bounded to the closed interval [0.0, 8.0].
    # NOTE(review): the None default does not match the ``float`` annotation;
    # presumably a code-generator artefact of a nullable schema — confirm.
    root: float = Field(
        default=None,
        description='Entropy threshold for HexHighEntropyString (0-8). Defaults to detect-secrets built-in of 3.0 when null. Lower values catch more secrets but increase false positives.',
        ge=0.0,
        le=8.0,
    )
396
+
397
+
398
class SecretsDetectorConfig(DetectorConfig):
    """
    Configuration for secrets detector powered by detect-secrets
    """

    # Plugin selection and entropy tuning; all default to None.
    enabled_patterns: EnabledPatterns | None = Field(
        default=None,
        description='Subset of detect-secrets plugins to enable. When null all supported plugins are active.',
    )
    entropy_limit_base64: EntropyLimitBase64 | None = Field(
        default=None,
        description='Entropy threshold for Base64HighEntropyString (0-8). Defaults to detect-secrets built-in of 4.5 when null. Lower values catch more secrets but increase false positives.',
    )
    entropy_limit_hex: EntropyLimitHex | None = Field(
        default=None,
        description='Entropy threshold for HexHighEntropyString (0-8). Defaults to detect-secrets built-in of 3.0 when null. Lower values catch more secrets but increase false positives.',
    )

    # Result filtering.
    confidence_threshold: float | None = Field(
        default=0.7,
        description='Minimum confidence score to report a finding (0-1)',
        ge=0.0,
        le=1.0,
    )
    max_findings: int | None = Field(
        default=None,
        description='Maximum number of findings to return per asset',
    )
    severity_threshold: Severity | None = Field(
        default=None,
        description='Minimum severity level to include in results. Findings below this threshold are suppressed.',
    )
428
+
429
+
430
class MaxLength(RootModel[int]):
    # Root wrapper for an integer constrained to be >= 1.
    # NOTE(review): the None default does not match the ``int`` annotation;
    # presumably a code-generator artefact of a nullable schema — confirm.
    root: int = Field(
        default=None,
        description="Override spaCy's nlp.max_length (default 1,000,000 chars). Set higher than your longest expected input to avoid the E088 error. Prefer chunk_size for very large texts.",
        ge=1,
    )
436
+
437
+
438
class ChunkSize(RootModel[int]):
    # Root wrapper for an integer constrained to be >= 1.
    # NOTE(review): the None default does not match the ``int`` annotation;
    # presumably a code-generator artefact of a nullable schema — confirm.
    root: int = Field(
        default=None,
        description='Split text into chunks of this many characters before analysis. Findings from all chunks are merged with corrected offsets. When null the full text is passed as-is (subject to max_length).',
        ge=1,
    )
444
+
445
+
446
class ChunkOverlap(RootModel[int]):
    # Root wrapper for a non-negative integer; defaults to 0.
    root: int = Field(
        default=0,
        description='Character overlap between consecutive chunks. Helps detect entities that span a chunk boundary.',
        ge=0,
    )
452
+
453
+
454
class PIIDetectorConfig(DetectorConfig):
    """
    Configuration for PII detector powered by Microsoft Presidio
    """

    # Entity and language/model selection.
    enabled_patterns: list[PIIEnabledPattern] | None = Field(
        default=None,
        description='Presidio entity types to detect. When null, all supported entities are enabled. Use PIIEnabledPattern values (e.g. EMAIL_ADDRESS, US_SSN, CREDIT_CARD).',
    )
    language: str | None = Field(
        default='en',
        description='BCP-47 language code for NER models (e.g. en, de, es)',
    )
    spacy_model: str | None = Field(
        default=None,
        description='spaCy model to load (e.g. en_core_web_sm, en_core_web_lg). Defaults to en_core_web_sm when null.',
    )
    spacy_model_url: str | None = Field(
        default=None,
        description='Wheel URL for the spaCy model. When set and the model is not installed, the CLI installs it at runtime.',
    )
    custom_recognizers: list[PIICustomRecognizer] | None = Field(
        default=None,
        description='Ad-hoc recognizers added to the Presidio registry at runtime. Each entry defines a regex-pattern or deny-list recognizer for a custom entity type.',
    )

    # Input sizing and chunking.
    max_length: MaxLength | None = Field(
        default=None,
        description="Override spaCy's nlp.max_length (default 1,000,000 chars). Set higher than your longest expected input to avoid the E088 error. Prefer chunk_size for very large texts.",
    )
    chunk_size: ChunkSize | None = Field(
        default=None,
        description='Split text into chunks of this many characters before analysis. Findings from all chunks are merged with corrected offsets. When null the full text is passed as-is (subject to max_length).',
    )
    # validate_default=True so the plain 0 default goes through validation.
    chunk_overlap: ChunkOverlap | None = Field(
        default=0,
        description='Character overlap between consecutive chunks. Helps detect entities that span a chunk boundary.',
        validate_default=True,
    )

    # Result filtering.
    confidence_threshold: float | None = Field(
        default=0.7,
        description='Minimum Presidio confidence score to report a finding (0-1)',
        ge=0.0,
        le=1.0,
    )
    max_findings: int | None = Field(
        default=None,
        description='Maximum number of findings to return per asset',
    )
500
+
501
+
502
class YaraRuleConfig(BaseModel):
    """
    A single YARA rule definition. Strings are assigned identifiers $s0, $s1, … in order and can be referenced in the condition by index or via 'any of them' / 'N of ($s*)'.
    """

    # The pattern requires a leading letter followed by letters, digits or
    # underscores.
    name: str = Field(
        description='Rule identifier — letters, digits, and underscores only. Must be unique within a config.',
        pattern='^[A-Za-z][A-Za-z0-9_]*$',
    )
    description: str | None = Field(
        default=None,
        description='Human-readable description of what the rule detects.',
    )
    severity: Severity = Field(
        description='Severity level emitted when the rule fires.',
    )
    category: str | None = Field(
        default=None,
        description="Free-form tag for grouping findings, e.g. 'secrets', 'malware', 'supply_chain'.",
    )
    # Rule body: the string patterns plus the condition that combines them.
    strings: list[str] = Field(
        description='YARA string patterns with optional modifiers. Assigned identifiers $s0, $s1, … in declaration order. Supported formats: regex (/pattern/ [ascii|wide|nocase|fullword]), literal ("text" [modifiers]), hex ({ HH HH … }).',
    )
    condition: str = Field(
        description="YARA condition expression, e.g. 'any of them', '2 of ($s*)', '$s0 and $s1'.",
    )
530
+
531
+
532
class ThreatDetectorConfig(DetectorConfig):
    """
    YARA-based threat detector. Compile one or more structured rule objects into a live YARA ruleset that is scanned against extracted asset content. Use the bundled examples as starting points and compose additional rules as needed.
    """

    # Rule set and per-asset scan timeout (seconds, default 60).
    rules: list[YaraRuleConfig] | None = Field(
        default=None,
        description='Rules to compile and run. When null or empty no scan is performed.',
    )
    timeout: int | None = Field(
        default=60,
        description='Maximum seconds to spend scanning a single asset before aborting.',
    )

    # Result filtering.
    confidence_threshold: float | None = Field(
        default=0.7,
        description='Minimum confidence score to report a finding (0-1). YARA confidence is computed from match count.',
        ge=0.0,
        le=1.0,
    )
    max_findings: int | None = Field(
        default=None,
        description='Maximum number of findings to return per asset',
    )
    severity_threshold: Severity | None = Field(
        default=None,
        description='Minimum severity level to include in results. Findings below this threshold are suppressed.',
    )
558
+
559
+
560
class BrokenLinksDetectorConfig(DetectorConfig):
    """
    Configuration for broken links detector
    """

    # The only field on this config; None is the default.
    max_findings: int | None = Field(
        default=None,
        description='Maximum number of broken link findings to return per asset',
    )
568
+
569
+
570
+ class CustomDetectorMethod(StrEnum):
571
+ """
572
+ Execution method for custom detector logic
573
+ """
574
+
575
+ RULESET = 'RULESET'
576
+ CLASSIFIER = 'CLASSIFIER'
577
+ ENTITY = 'ENTITY'
578
+ PIPELINE = 'PIPELINE'
579
+
580
+
581
class CustomRegexRule(BaseModel):
    # One regex rule: id, name and pattern are required.
    model_config = ConfigDict(extra='forbid')

    id: str = Field(description='Stable ID for this regex rule')
    name: str = Field(description='Display name for this regex rule')
    pattern: str = Field(description='Regular expression pattern')
    # Flags default to the empty string.
    flags: str | None = Field(
        default='',
        description='Regex flags (for example i, m, s)',
    )
    severity: Severity | None = None
590
+
591
+
592
class CustomKeywordRule(BaseModel):
    # One keyword rule: id, name and a non-empty keyword list are required.
    model_config = ConfigDict(extra='forbid')

    id: str = Field(description='Stable ID for this keyword rule')
    name: str = Field(description='Display name for this keyword rule')
    keywords: list[str] = Field(description='Keyword set to match', min_length=1)
    case_sensitive: bool | None = Field(
        default=False,
        description='Whether keyword matching is case-sensitive',
    )
    severity: Severity | None = None
603
+
604
+
605
+ class Type(StrEnum):
606
+ string = 'string'
607
+ number = 'number'
608
+ boolean = 'boolean'
609
+ list_string_ = 'list[string]'
610
+ list_number_ = 'list[number]'
611
+
612
+
613
+ class Aggregate(StrEnum):
614
+ """
615
+ How to aggregate multiple matches
616
+ """
617
+
618
+ first = 'first'
619
+ last = 'last'
620
+ list = 'list'
621
+ join = 'join'
622
+ count = 'count'
623
+
624
+
625
class CustomExtractorField(BaseModel):
    """
    One output field in the extraction schema
    """

    model_config = ConfigDict(
        extra='forbid',
    )
    name: str = Field(
        ..., description='Output field name — becomes a key in extracted_data JSON'
    )
    description: str | None = Field(
        None, description='Human-readable hint for what this field captures'
    )
    # Default is the enum member rather than the bare 'string' literal.
    # Defaults are not validated here, so a bare str would leave an unset
    # field holding a plain str while a supplied value holds a Type member.
    # The member still compares equal to, and serialises as, 'string'.
    type: Type | None = Type.string
    entity_label: str | None = Field(
        None,
        description='GLiNER2 schema label used for extraction (ENTITY and CLASSIFIER methods)',
    )
    regex_pattern: str | None = Field(
        None,
        description='Regex with one named capture group (?P<value>...) for RULESET method',
    )
    regex_flags: str | None = Field(
        'i', description='Regex flags: i=case-insensitive, m=multiline, s=dotall'
    )
    # Same reasoning as ``type``: default to the enum member, which is equal
    # to the previous 'list' literal.
    aggregate: Aggregate | None = Field(
        Aggregate.list, description='How to aggregate multiple matches'
    )
    join_separator: str | None = ', '
    min_confidence: float | None = Field(
        0.4, description='Minimum GLiNER confidence for this field', ge=0.0, le=1.0
    )
    required: bool | None = Field(
        False, description='If true, skip saving extraction when this field is empty'
    )
661
+
662
+
663
class CustomExtractorConfig(BaseModel):
    """
    Structured extraction — runs after detection fires on the same content
    """

    model_config = ConfigDict(extra='forbid')

    enabled: bool | None = True
    # At least one field definition is required.
    fields: list[CustomExtractorField] = Field(min_length=1)
    gliner_model: str | None = 'fastino/gliner2-base-v1'
    # Bounded to 320..8192 characters, default 4000.
    content_limit: int | None = Field(
        default=4000,
        description='Chars of content to pass to extractor (classifier matched_content is only 320 chars)',
        ge=320,
        le=8192,
    )
680
+
681
+
682
class CustomRulesetConfig(BaseModel):
    # Regex and keyword rule lists; both default to an empty list and set
    # validate_default=True.
    model_config = ConfigDict(extra='forbid')

    regex_rules: list[CustomRegexRule] | None = Field(
        default=[],
        validate_default=True,
    )
    keyword_rules: list[CustomKeywordRule] | None = Field(
        default=[],
        validate_default=True,
    )
688
+
689
+
690
class CustomClassifierLabel(BaseModel):
    # One classifier label: id and name are required, description optional.
    model_config = ConfigDict(extra='forbid')

    id: str
    name: str
    description: str | None = None
697
+
698
+
699
class CustomClassifierTrainingExample(BaseModel):
    # One training example: text and label are required.
    model_config = ConfigDict(extra='forbid')

    text: str
    label: str
    accepted: bool | None = True
    source: str | None = Field(
        default='editor',
        description='Origin of this example (editor/feedback/import)',
    )
709
+
710
+
711
class CustomClassifierConfig(BaseModel):
    # Classifier settings: labels, model names and training examples.
    model_config = ConfigDict(extra='forbid')

    labels: list[CustomClassifierLabel] | None = Field(
        default=[],
        validate_default=True,
    )
    zero_shot_model: str | None = 'MoritzLaurer/mDeBERTa-v3-base-mnli-xnli'
    hypothesis_template: str | None = 'This text contains {}.'
    training_examples: list[CustomClassifierTrainingExample] | None = Field(
        default=[],
        validate_default=True,
    )
    # Must be >= 1; default 8.
    min_examples_per_label: int | None = Field(default=8, ge=1)
    setfit_model: str | None = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
725
+
726
+
727
class CustomEntityConfig(BaseModel):
    # Entity-extraction settings: labels, optional per-label descriptions,
    # and the model name.
    model_config = ConfigDict(extra='forbid')

    entity_labels: list[str] | None = []
    entity_descriptions: dict[str, str] | None = Field(
        default={},
        description='Optional GLiNER2 schema descriptions keyed by entity label',
    )
    model: str | None = 'fastino/gliner2-base-v1'
736
+
737
+
738
class Skip(RootModel[str]):
    # Required string matching 'B' followed by one or more digits.
    root: str = Field(pattern='^B[0-9]+$')
740
+
741
+
742
class Skips(RootModel[list[Skip]]):
    # Root wrapper for a list of Skip IDs with at least one item.
    # NOTE(review): the None default does not match the non-optional
    # annotation; presumably a code-generator artefact of a nullable schema —
    # confirm before relying on ``Skips()`` with no argument.
    root: list[Skip] = Field(
        default=None,
        description="Bandit test IDs to skip (e.g. ['B101', 'B105']). When null all tests run. Takes precedence over tests when both are set.",
        min_length=1,
    )
748
+
749
+
750
class Test(RootModel[str]):
    # Required string matching 'B' followed by one or more digits.
    root: str = Field(pattern='^B[0-9]+$')
752
+
753
+
754
class Tests(RootModel[list[Test]]):
    # Root wrapper for a list of Test IDs with at least one item.
    # NOTE(review): the None default does not match the non-optional
    # annotation; presumably a code-generator artefact of a nullable schema —
    # confirm before relying on ``Tests()`` with no argument.
    root: list[Test] = Field(
        default=None,
        description="Explicit list of Bandit test IDs to run (e.g. ['B102', 'B301']). When null all tests (minus skips) run.",
        min_length=1,
    )
760
+
761
+
762
class CodeSecurityDetectorConfig(DetectorConfig):
    """
    Configuration for the code security detector powered by Bandit static analysis. Use skips/tests to control which Bandit checks run.
    """

    # Check selection; both default to None.
    skips: Skips | None = Field(
        default=None,
        description="Bandit test IDs to skip (e.g. ['B101', 'B105']). When null all tests run. Takes precedence over tests when both are set.",
    )
    tests: Tests | None = Field(
        default=None,
        description="Explicit list of Bandit test IDs to run (e.g. ['B102', 'B301']). When null all tests (minus skips) run.",
    )

    # Result filtering.
    severity_threshold: Severity | None = Field(
        default=None,
        description='Minimum Bandit issue severity to report. Findings below this threshold are suppressed.',
    )
    confidence_threshold: float | None = Field(
        default=0.7,
        description='Minimum confidence score to report a finding (0-1)',
        ge=0.0,
        le=1.0,
    )
    max_findings: int | None = Field(
        default=None,
        description='Maximum number of findings to return per asset',
    )
788
+
789
+
790
class GenericDetectorConfig(DetectorConfig):
    """
    Generic config for detectors without specialised parameters.
    """

    # Bounded to the closed interval [0, 1] by ge/le.
    confidence_threshold: float | None = Field(
        default=0.7,
        ge=0.0,
        le=1.0,
        description="Minimum confidence score to report a finding (0-1)",
    )
    max_findings: int | None = Field(
        default=None,
        description="Maximum number of findings to return per asset",
    )
804
+
805
+
806
class PipelineEntityDefinition(BaseModel):
    # One entity entry of a pipeline schema: free-text description plus a
    # required flag. Comments only, so the JSON schema gains no description.
    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    description: str | None = ""
    required: bool | None = False
812
+
813
+
814
class PipelineModelConfig(BaseModel):
    # Optional model reference; both fields default to None.
    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    path: str | None = None
    name: str | None = None
820
+
821
+
822
class PipelineClassificationDefinition(BaseModel):
    # One classification entry of a pipeline schema: candidate labels and a
    # multi-label flag.
    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    labels: list[str] | None = []
    multi_label: bool | None = False
828
+
829
+
830
class PipelineValidationRule(BaseModel):
    # A single validation rule; `type` defaults to 'regex'.
    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    field: str | None = ""
    type: str | None = "regex"
    pattern: str | None = None
837
+
838
+
839
class PipelineValidationConfig(BaseModel):
    # Validation settings: a confidence threshold and an optional rule list.
    # NOTE(review): unlike other thresholds in this file, this one carries no
    # ge/le bounds. Confirm whether that is intentional.
    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    confidence_threshold: float | None = 0.7
    rules: list[PipelineValidationRule] | None = None
845
+
846
+
847
class PipelineResult(BaseModel):
    # Result container with three optional sections, each defaulting to None.
    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    # Maps a string key to a list of free-form dicts.
    entities: dict[str, list[dict[str, Any]]] | None = None
    # Maps a string key to one free-form dict.
    classification: dict[str, dict[str, Any]] | None = None
    metadata: dict[str, Any] | None = None
854
+
855
+
856
+ class Severity1(StrEnum):
857
+ """
858
+ Severity level assigned to findings from this pattern. When omitted, defaults to high (confidence is always 1.0 for regex).
859
+ """
860
+
861
+ critical = 'critical'
862
+ high = 'high'
863
+ medium = 'medium'
864
+ low = 'low'
865
+ info = 'info'
866
+
867
+
868
class RegexPatternDefinition(BaseModel):
    # Definition of one regex pattern and its matching options. Comments
    # only, so the generated JSON schema gains no description.
    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    # The only required field.
    pattern: str = Field(
        default=...,
        description="Regular expression pattern (RE2 syntax recommended)",
    )
    flags: Any | None = Field(
        default=None,
        description="Legacy integer flags (e.g. re.IGNORECASE). Prefer the boolean fields below instead.",
    )
    description: str | None = Field(
        default=None,
        description="Human-readable description of the pattern",
    )
    severity: Severity1 | None = Field(
        default=None,
        description="Severity level assigned to findings from this pattern. When omitted, defaults to high (confidence is always 1.0 for regex).",
    )

    # Boolean matching options.
    case_sensitive: bool | None = Field(
        default=True,
        description="Whether matching is case-sensitive. Set to false for case-insensitive matching.",
    )
    dot_nl: bool | None = Field(
        default=False,
        description="Whether the dot (.) metacharacter matches newline characters.",
    )
    literal: bool | None = Field(
        default=False,
        description="Treat the pattern as a literal string instead of a regular expression.",
    )
    longest_match: bool | None = Field(
        default=False,
        description="Prefer the longest possible match instead of the first match (RE2 only).",
    )

    # Numeric options, each with a lower bound.
    max_mem: int | None = Field(
        default=None,
        ge=1,
        description="Maximum memory (bytes) for the RE2 automaton. RE2 only; ignored with stdlib fallback.",
    )
    group: int | None = Field(
        default=0,
        ge=0,
        description="Capture group index to extract as matched content. 0 = entire match (default).",
    )
912
+
913
+
914
class PipelineSeverityRule(BaseModel):
    """
    Maps a predicted label to a severity level. Pattern is matched case-insensitively as a substring of the label.
    """

    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    # Both fields are required.
    pattern: str = Field(
        default=...,
        description="Case-insensitive substring matched against the predicted label (e.g. 'spam', 'nsfw', 'person').",
    )
    severity: Severity
927
+
928
+
929
+ class Type1(StrEnum):
930
+ GLINER2 = 'GLINER2'
931
+
932
+
933
class GLiNER2PipelineSchema(BaseModel):
    # Pipeline schema tagged 'GLINER2'. Comments only, so the generated JSON
    # schema gains no description.
    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    # Tag field; defaults to the only allowed value.
    type: Literal["GLINER2"] = "GLINER2"
    entities: dict[str, PipelineEntityDefinition] | None = None
    classification: dict[str, PipelineClassificationDefinition] | None = None
    model: PipelineModelConfig | None = None
    validation: PipelineValidationConfig | None = None
942
+
943
+
944
+ class Type2(StrEnum):
945
+ REGEX = 'REGEX'
946
+
947
+
948
class RegexPipelineSchema(BaseModel):
    # Pipeline schema tagged 'REGEX': named patterns plus optional validation.
    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    # Tag field; defaults to the only allowed value.
    type: Literal["REGEX"] = "REGEX"
    patterns: dict[str, RegexPatternDefinition] | None = None
    validation: PipelineValidationConfig | None = None
955
+
956
+
957
+ class Type3(StrEnum):
958
+ LLM = 'LLM'
959
+
960
+
961
class LLMPipelineSchema(BaseModel):
    # Pipeline schema tagged 'LLM'; it declares no fields beyond the tag.
    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    type: Literal["LLM"] = "LLM"
966
+
967
+
968
+ class Type4(StrEnum):
969
+ TEXT_CLASSIFICATION = 'TEXT_CLASSIFICATION'
970
+
971
+
972
+ class FunctionToApply(StrEnum):
973
+ """
974
+ Score normalization: 'softmax' for single-label, 'sigmoid' for multi-label, 'none' for raw logits.
975
+ """
976
+
977
+ sigmoid = 'sigmoid'
978
+ softmax = 'softmax'
979
+ none = 'none'
980
+
981
+
982
class MaxLength1(RootModel[int]):
    # Positive integer (ge=1).
    # NOTE(review): the default is None although `root` is annotated as a
    # plain int and defaults are not validated here. Confirm this generated
    # default is intended.
    root: int = Field(
        default=None,
        ge=1,
        description="Override the tokenizer's maximum sequence length.",
    )
986
+
987
+
988
class ChunkSize1(RootModel[int]):
    # Positive integer (ge=1).
    # NOTE(review): the default is None although `root` is annotated as a
    # plain int and defaults are not validated here. Confirm this generated
    # default is intended.
    root: int = Field(
        default=None,
        ge=1,
        description="Split text into chunks of this many characters before classification.",
    )
994
+
995
+
996
class ChunkOverlap1(RootModel[int]):
    # Non-negative integer (ge=0) with a default of 0.
    root: int = Field(
        default=0,
        ge=0,
        description="Character overlap between consecutive chunks.",
    )
1000
+
1001
+
1002
class TextClassificationPipelineSchema(BaseModel):
    """
    Text classification pipeline using a HuggingFace fine-tuned model. Runs a single model; create multiple custom detectors to run multiple classifiers.
    """

    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    # Tag field; required, with exactly one allowed value.
    type: Literal["TEXT_CLASSIFICATION"]

    # Model selection and placement.
    model: str = Field(
        default=...,
        description="HuggingFace hub ID (e.g. 'mrm8488/bert-tiny-finetuned-sms-spam-detection') or absolute local directory path.",
    )
    model_revision: str | None = Field(
        default=None,
        description="Git branch, tag, or commit hash when fetching from the HuggingFace hub.",
    )
    device: str | None = Field(
        default="cpu",
        description="Inference device: 'cpu' (default), 'cuda', 'mps', or a CUDA device string like 'cuda:0'.",
    )

    # Prediction post-processing.
    top_k: int | None = Field(
        default=None,
        description="Maximum number of top predictions to return. When null all labels above confidence_threshold are returned.",
    )
    function_to_apply: FunctionToApply | None = Field(
        default=None,
        description="Score normalization: 'softmax' for single-label, 'sigmoid' for multi-label, 'none' for raw logits.",
    )
    confidence_threshold: float | None = Field(
        default=0.7,
        ge=0.0,
        le=1.0,
        description="Minimum prediction confidence to report a label as a finding (0-1).",
    )

    # Severity assignment.
    severity: Severity | None = Field(
        default="info",
        description="Default severity when no severity_map rule matches.",
        # The default is written as a plain string. Validating it makes an
        # omitted field hold the same Severity value an explicitly supplied
        # 'info' would be coerced to, instead of a bare str.
        validate_default=True,
    )
    severity_map: list[PipelineSeverityRule] | None = Field(
        default=None,
        description="Ordered rules mapping predicted labels to severity levels. First matching rule wins.",
    )

    # Input sizing and chunking.
    max_length: MaxLength1 | None = Field(
        default=None,
        description="Override the tokenizer's maximum sequence length.",
    )
    chunk_size: ChunkSize1 | None = Field(
        default=None,
        description="Split text into chunks of this many characters before classification.",
    )
    # The int default is validated so it is wrapped in ChunkOverlap1.
    chunk_overlap: ChunkOverlap1 | None = Field(
        default=0,
        description="Character overlap between consecutive chunks.",
        validate_default=True,
    )
1056
+
1057
+
1058
+ class Type5(StrEnum):
1059
+ IMAGE_CLASSIFICATION = 'IMAGE_CLASSIFICATION'
1060
+
1061
+
1062
+ class FunctionToApply1(StrEnum):
1063
+ """
1064
+ Score normalization applied after the model forward pass.
1065
+ """
1066
+
1067
+ sigmoid = 'sigmoid'
1068
+ softmax = 'softmax'
1069
+ none = 'none'
1070
+
1071
+
1072
class ImageClassificationPipelineSchema(BaseModel):
    """
    Image classification pipeline using a HuggingFace vision model. Runs a single model; create multiple custom detectors to run multiple classifiers.
    """

    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    # Tag field; required, with exactly one allowed value.
    type: Literal["IMAGE_CLASSIFICATION"]

    # Model selection and placement; `model` is optional in this schema.
    model: str | None = Field(
        default=None,
        description="HuggingFace hub ID or local path. Defaults to 'google/vit-base-patch16-224' when null.",
    )
    model_revision: str | None = Field(
        default=None,
        description="Git branch, tag, or commit hash when fetching from the HuggingFace hub.",
    )
    device: str | None = Field(
        default="cpu",
        description="Inference device: 'cpu' (default), 'cuda', 'mps', or a CUDA device string like 'cuda:0'.",
    )

    # Prediction post-processing.
    top_k: int | None = Field(
        default=None,
        description="Maximum number of top predictions to return per image.",
    )
    function_to_apply: FunctionToApply1 | None = Field(
        default=None,
        description="Score normalization applied after the model forward pass.",
    )
    confidence_threshold: float | None = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description="Minimum prediction confidence to report a label as a finding (0-1). Defaults to 0 so all top_k predictions are reported.",
    )
    severity_map: list[PipelineSeverityRule] | None = Field(
        default=None,
        description="Ordered rules mapping predicted labels to severity levels. Labels with no matching rule receive 'info' severity.",
    )
1109
+
1110
+
1111
+ class Type6(StrEnum):
1112
+ FEATURE_EXTRACTION = 'FEATURE_EXTRACTION'
1113
+
1114
+
1115
+ class PoolingStrategy(StrEnum):
1116
+ """
1117
+ How to aggregate per-token hidden states into a single embedding vector.
1118
+ """
1119
+
1120
+ mean = 'mean'
1121
+ cls = 'cls'
1122
+ max = 'max'
1123
+ none = 'none'
1124
+
1125
+
1126
class ChunkSize2(RootModel[int]):
    # Positive integer (ge=1).
    # NOTE(review): the default is None although `root` is annotated as a
    # plain int and defaults are not validated here. Confirm this generated
    # default is intended.
    root: int = Field(
        default=None,
        ge=1,
        description="Split text into chunks of this many characters before embedding. Each chunk produces its own finding.",
    )
1132
+
1133
+
1134
class FeatureExtractionPipelineSchema(BaseModel):
    """
    Feature extraction (embedding) pipeline using a HuggingFace model. Runs a single encoder; create multiple custom detectors to run multiple embedding models.
    """

    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    # Tag field; required, with exactly one allowed value.
    type: Literal["FEATURE_EXTRACTION"]

    # Model selection and placement.
    model: str = Field(
        default=...,
        description="HuggingFace hub ID (e.g. 'BAAI/bge-base-en-v1.5', 'sentence-transformers/all-MiniLM-L6-v2') or absolute local directory path.",
    )
    model_revision: str | None = Field(
        default=None,
        description="Git branch, tag, or commit hash when fetching from the HuggingFace hub.",
    )
    device: str | None = Field(
        default="cpu",
        description="Inference device: 'cpu' (default), 'cuda', 'mps', or a CUDA device string like 'cuda:0'.",
    )

    # Embedding construction.
    pooling_strategy: PoolingStrategy | None = Field(
        default="mean",
        description="How to aggregate per-token hidden states into a single embedding vector.",
        # The default is written as a plain string. Validating it makes an
        # omitted field hold PoolingStrategy.mean, the same member an
        # explicitly supplied 'mean' is coerced to, instead of a bare str.
        validate_default=True,
    )
    normalize_embeddings: bool | None = Field(
        default=True,
        description="L2-normalise the final embedding vector. Recommended for cosine-similarity workloads.",
    )

    # Tokenisation and batching.
    truncation: bool | None = Field(
        default=True,
        description="Truncate input to the model's maximum sequence length.",
    )
    max_length: int | None = Field(
        default=None,
        description="Override the tokenizer's default maximum sequence length.",
    )
    batch_size: int | None = Field(
        default=8,
        description="Number of texts to encode in a single forward pass.",
    )

    # Chunking.
    chunk_size: ChunkSize2 | None = Field(
        default=None,
        description="Split text into chunks of this many characters before embedding. Each chunk produces its own finding.",
    )
    # The int default is validated so it is wrapped in ChunkOverlap1.
    chunk_overlap: ChunkOverlap1 | None = Field(
        default=0,
        description="Character overlap between consecutive chunks.",
        validate_default=True,
    )
1181
+
1182
+
1183
+ class Type7(StrEnum):
1184
+ OBJECT_DETECTION = 'OBJECT_DETECTION'
1185
+
1186
+
1187
class NmsThreshold(RootModel[float]):
    # Float bounded to the closed interval [0, 1].
    # NOTE(review): the default is None although `root` is annotated as a
    # plain float and defaults are not validated here. Confirm this generated
    # default is intended.
    root: float = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description="IoU threshold for non-maximum suppression. When null the model's default post-processing is used.",
    )
1194
+
1195
+
1196
class ObjectDetectionPipelineSchema(BaseModel):
    """
    Object detection pipeline using a HuggingFace model. Runs a single detector; create multiple custom detectors to run multiple detection models.
    """

    model_config = ConfigDict(extra="forbid")  # unknown keys are rejected

    # Tag field; required, with exactly one allowed value.
    type: Literal["OBJECT_DETECTION"]

    # Model selection and placement.
    model: str = Field(
        default=...,
        description="HuggingFace hub ID (e.g. 'facebook/detr-resnet-50', 'hustvl/yolos-small') or absolute local directory path.",
    )
    model_revision: str | None = Field(
        default=None,
        description="Git branch, tag, or commit hash when fetching from the HuggingFace hub.",
    )
    device: str | None = Field(
        default="cpu",
        description="Inference device: 'cpu' (default), 'cuda', 'mps', or a CUDA device string like 'cuda:0'.",
    )

    # Detection filtering.
    confidence_threshold: float | None = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Minimum detection confidence to report an object as a finding (0-1).",
    )
    top_k: int | None = Field(
        default=None,
        description="Keep only the top-k highest-confidence detections per image.",
    )
    nms_threshold: NmsThreshold | None = Field(
        default=None,
        description="IoU threshold for non-maximum suppression. When null the model's default post-processing is used.",
    )
    min_box_area: int | None = Field(
        default=None,
        description="Minimum bounding-box area in pixels (width × height). Smaller detections are suppressed.",
    )
    severity_map: list[PipelineSeverityRule] | None = Field(
        default=None,
        description="Ordered rules mapping detected object labels to severity levels. Labels with no matching rule receive 'info' severity.",
    )
1238
+
1239
+
1240
class CustomDetectorConfig(DetectorConfig):
    """
    Configuration for user-defined detector execution
    """

    # Identity: the two required fields.
    custom_detector_key: str = Field(
        default=...,
        description="Stable key used to identify one custom detector instance",
    )
    name: str = Field(default=..., description="User-facing name of custom detector")
    description: str | None = None

    # Optional sub-configurations; each defaults to None unless noted.
    method: CustomDetectorMethod | None = None
    languages: list[str] | None = ["de", "en"]
    ruleset: CustomRulesetConfig | None = None
    classifier: CustomClassifierConfig | None = None
    entity: CustomEntityConfig | None = None
    extractor: CustomExtractorConfig | None = Field(
        default=None,
        description="Optional structured extraction — runs when detector fires",
    )

    # Tagged union of pipeline schemas, selected by each member's `type`.
    pipeline_schema: (
        GLiNER2PipelineSchema
        | RegexPipelineSchema
        | LLMPipelineSchema
        | TextClassificationPipelineSchema
        | ImageClassificationPipelineSchema
        | FeatureExtractionPipelineSchema
        | ObjectDetectionPipelineSchema
        | None
    ) = Field(default=None, discriminator="type", title="AnyPipelineSchema")

    max_findings: int | None = Field(
        default=None,
        description="Maximum number of findings to return per asset",
    )
1271
+
1272
+
1273
class DetectorsRefactored(
    RootModel[
        SecretsDetectorConfig
        | PIIDetectorConfig
        | ThreatDetectorConfig
        | BrokenLinksDetectorConfig
        | CustomDetectorConfig
        | CodeSecurityDetectorConfig
        | GenericDetectorConfig
    ]
):
    # Root model whose value is any one of the detector config models. The
    # union below repeats the type parameter above, in the same order.
    root: (
        SecretsDetectorConfig
        | PIIDetectorConfig
        | ThreatDetectorConfig
        | BrokenLinksDetectorConfig
        | CustomDetectorConfig
        | CodeSecurityDetectorConfig
        | GenericDetectorConfig
    ) = Field(
        default=...,
        title="DetectorsRefactored",
        description="Merged detector schemas with all detector types and common definitions",
    )