classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,2732 @@
1
+ # generated by datamodel-codegen:
2
+ # filename: all_input_sources.json
3
+
4
+ from __future__ import annotations
5
+
6
+ from enum import StrEnum
7
+ from typing import Any, Literal
8
+
9
+ from pydantic import (
10
+ AnyUrl,
11
+ AwareDatetime,
12
+ BaseModel,
13
+ ConfigDict,
14
+ EmailStr,
15
+ Field,
16
+ RootModel,
17
+ )
18
+
19
+
20
+ class AssetType(StrEnum):
21
+ """
22
+ Type of the asset or source
23
+ """
24
+
25
+ WORDPRESS = 'WORDPRESS'
26
+ SLACK = 'SLACK'
27
+ S3_COMPATIBLE_STORAGE = 'S3_COMPATIBLE_STORAGE'
28
+ AZURE_BLOB_STORAGE = 'AZURE_BLOB_STORAGE'
29
+ GOOGLE_CLOUD_STORAGE = 'GOOGLE_CLOUD_STORAGE'
30
+ POSTGRESQL = 'POSTGRESQL'
31
+ MYSQL = 'MYSQL'
32
+ MSSQL = 'MSSQL'
33
+ ORACLE = 'ORACLE'
34
+ HIVE = 'HIVE'
35
+ DATABRICKS = 'DATABRICKS'
36
+ SNOWFLAKE = 'SNOWFLAKE'
37
+ MONGODB = 'MONGODB'
38
+ NEO4J = 'NEO4J'
39
+ POWERBI = 'POWERBI'
40
+ TABLEAU = 'TABLEAU'
41
+ CONFLUENCE = 'CONFLUENCE'
42
+ JIRA = 'JIRA'
43
+ SERVICEDESK = 'SERVICEDESK'
44
+
45
+
46
+ class SourceCategory(StrEnum):
47
+ """
48
+ Category of the source: TABULAR for structured databases (PostgreSQL, MySQL, MSSQL, Oracle, Hive, Databricks Unity Catalog, Snowflake), UNSTRUCTURED for text/web/document sources (WordPress, S3-Compatible Storage, Azure Blob Storage, Google Cloud Storage, Slack, MongoDB, PowerBI, Tableau, Confluence, Jira, Service Desk)
49
+ """
50
+
51
+ TABULAR = 'TABULAR'
52
+ UNSTRUCTURED = 'UNSTRUCTURED'
53
+
54
+
55
+ class DetectorType(StrEnum):
56
+ """
57
+ Type of detector for content analysis
58
+ """
59
+
60
+ SECRETS = 'SECRETS'
61
+ PII = 'PII'
62
+ YARA = 'YARA'
63
+ BROKEN_LINKS = 'BROKEN_LINKS'
64
+ CODE_SECURITY = 'CODE_SECURITY'
65
+ CUSTOM = 'CUSTOM'
66
+
67
+
68
+ class PostStatus(StrEnum):
69
+ """
70
+ WordPress post status
71
+ """
72
+
73
+ publish = 'publish'
74
+ future = 'future'
75
+ draft = 'draft'
76
+ pending = 'pending'
77
+ private = 'private'
78
+
79
+
80
+ class SlackChannelType(StrEnum):
81
+ """
82
+ Slack conversation types to include
83
+ """
84
+
85
+ public_channel = 'public_channel'
86
+ private_channel = 'private_channel'
87
+ mpim = 'mpim'
88
+ im = 'im'
89
+
90
+
91
+ class SamplingStrategy(StrEnum):
92
+ """
93
+ Sampling strategy: RANDOM samples items randomly, LATEST prioritises the most recently modified/created items, ALL scans every item with no limit
94
+ """
95
+
96
+ RANDOM = 'RANDOM'
97
+ LATEST = 'LATEST'
98
+ ALL = 'ALL'
99
+
100
+
101
class SamplingConfig(BaseModel):
    """
    Controls how content is extracted from each source. For tabular sources rows_per_page controls both sample size for RANDOM/LATEST and pagination batch size for ALL.
    """

    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # The only field without a default: callers must pick a strategy.
    strategy: SamplingStrategy
    enable_ocr: bool | None = Field(
        default=False,
        description="When true, enable OCR/text extraction for supported binary documents and images before routing text-capable detectors.",
    )
    order_by_column: str | None = Field(
        default=None,
        description="Column to use for LATEST sampling mode in tabular sources (usually created_at/updated_at). Auto-detected when not set.",
    )
    fallback_to_random: bool | None = Field(
        default=True,
        description="Tabular sources only. Fallback to RANDOM ordering when LATEST mode cannot resolve an ordering column.",
    )
    include_column_names: bool | None = Field(
        default=True,
        description="Tabular sources only. Include column names in sampled detector payload rows.",
    )
    # Accepted range is 10..10000 inclusive.
    rows_per_page: int | None = Field(
        default=100,
        ge=10,
        le=10000,
        description="Tabular sources only. Number of rows per sample (RANDOM/LATEST) or per pagination batch (ALL). Controls memory usage during large table scans.",
    )
132
+
133
+
134
class Requests(BaseModel):
    # Resource request strings; both are optional and unknown keys are rejected.
    model_config = ConfigDict(extra="forbid")

    cpu: str | None = Field(default=None, description="CPU request (e.g. 500m)")
    memory: str | None = Field(default=None, description="Memory request (e.g. 1Gi)")
140
+
141
+
142
class Limits(BaseModel):
    # Resource limit strings; both are optional and unknown keys are rejected.
    model_config = ConfigDict(extra="forbid")

    cpu: str | None = Field(default=None, description="CPU limit (e.g. 2)")
    memory: str | None = Field(default=None, description="Memory limit (e.g. 4Gi)")
148
+
149
+
150
class ResourceOverrides(BaseModel):
    """
    Override K8s job resources and timeout for this source.
    """

    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    requests: Requests | None = None
    limits: Limits | None = None
    # Accepted range is 60..86400 seconds inclusive.
    timeout_seconds: int | None = Field(
        default=None,
        ge=60,
        le=86400,
        description="Max runtime in seconds (overrides activeDeadlineSeconds)",
    )
    # Accepted range is 1..20 inclusive.
    processing_workers: int | None = Field(
        default=None,
        ge=1,
        le=20,
        description="Number of parallel asset-processing workers in Phase 2 (default: 2)",
    )
    # Accepted range is 1..50 inclusive.
    detector_max_concurrent: int | None = Field(
        default=None,
        ge=1,
        le=50,
        description="Max concurrent detector invocations across all pages (default: 5)",
    )
178
+
179
+
180
class Detector(BaseModel):
    # Unlike most models in this module, undeclared keys are accepted here
    # (extra="allow") instead of being rejected.
    model_config = ConfigDict(extra="allow")

    type: DetectorType
    enabled: bool | None = True
    config: dict[str, Any] | None = Field(
        default=None,
        description="Detector-specific configuration",
    )
189
+
190
+
191
class CustomDetectorSelection(RootModel[str]):
    # Root value is a required string of at least one character.
    root: str = Field(
        ...,
        min_length=1,
    )
193
+
194
+
195
class WordPressRequired(BaseModel):
    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # Required; validated as a URL by pydantic's AnyUrl type.
    url: AnyUrl = Field(
        ...,
        description="Base URL of the WordPress site (e.g., https://example.com)",
    )
202
+
203
+
204
class WordPressMasked(BaseModel):
    # Both credentials are optional; unknown keys are rejected.
    model_config = ConfigDict(extra="forbid")

    username: str | None = Field(
        default=None,
        description="Username for authentication (optional for public content)",
    )
    application_password: str | None = Field(
        default=None,
        description="WordPress application password for authentication (optional)",
    )
214
+
215
+
216
class WordPressOptionalContent(BaseModel):
    """
    Content scope and filters for WordPress ingestion.
    """

    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(
        extra='forbid',
    )
    fetch_posts: bool | None = Field(True, description='Whether to fetch blog posts')
    fetch_pages: bool | None = Field(True, description='Whether to fetch pages')
    # The default holds a PostStatus member rather than the bare string
    # 'publish'. Pydantic does not validate defaults unless validate_default
    # is enabled, so a plain-string default would not match the declared
    # list[PostStatus] type. The member is a str equal to 'publish', so
    # equality checks and JSON output are unchanged.
    post_status: list[PostStatus] | None = Field(
        [PostStatus.publish],
        description='Post status filters (requires authentication for non-public statuses)',
    )
230
+
231
+
232
class WordPressOptional(BaseModel):
    # Container for the optional WordPress settings group; unknown keys are
    # rejected.
    model_config = ConfigDict(extra="forbid")

    content: WordPressOptionalContent | None = None
237
+
238
+
239
class SlackRequired(BaseModel):
    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # Despite the class name, this field has a default of None.
    workspace: str | None = Field(
        default=None,
        description="Slack workspace name or domain (for display and stable IDs)",
    )
246
+
247
+
248
class SlackMaskedBotToken(BaseModel):
    # Credential shape carrying a single required bot token.
    model_config = ConfigDict(extra="forbid")

    bot_token: str = Field(
        ...,
        description="Slack bot token (starts with xoxb-)",
    )
253
+
254
+
255
class SlackMaskedUserToken(BaseModel):
    # Credential shape carrying a single required user token.
    model_config = ConfigDict(extra="forbid")

    user_token: str = Field(
        ...,
        description="Slack user token (starts with xoxp-)",
    )
260
+
261
+
262
+ class TokenType(StrEnum):
263
+ """
264
+ Token type hint
265
+ """
266
+
267
+ bot = 'bot'
268
+ user = 'user'
269
+
270
+
271
class SlackMaskedToken(BaseModel):
    # Generic credential shape: a required token plus an optional hint about
    # which kind of token it is.
    model_config = ConfigDict(extra="forbid")

    token: str = Field(
        ...,
        description="Slack token (bot or user)",
    )
    token_type: TokenType | None = Field(
        default=None,
        description="Token type hint",
    )
277
+
278
+
279
class SlackOptionalChannels(BaseModel):
    """
    Channel discovery and targeting controls.
    """

    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(
        extra='forbid',
    )
    # The default holds a SlackChannelType member rather than the bare string
    # 'public_channel'. Pydantic does not validate defaults unless
    # validate_default is enabled, so a plain-string default would not match
    # the declared list[SlackChannelType] type. The member is a str equal to
    # 'public_channel', so equality checks and JSON output are unchanged.
    channel_types: list[SlackChannelType] | None = Field(
        [SlackChannelType.public_channel],
        description='Slack conversation types to include when listing channels',
    )
    channel_ids: list[str] | None = Field(
        None,
        description='Explicit channel IDs to scan. If provided, channel_types is ignored.',
    )
    exclude_archived: bool | None = Field(
        True, description='Exclude archived channels when listing'
    )
298
+
299
+
300
class SlackOptionalTimeRange(BaseModel):
    """
    Time window filters for message ingestion.
    """

    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # Both bounds are kept as raw strings; no parsing is done in this model.
    oldest: str | None = Field(
        default=None,
        description="Start of date range (Slack timestamp or ISO 8601)",
    )
    latest: str | None = Field(
        default=None,
        description="End of date range (Slack timestamp or ISO 8601)",
    )
314
+
315
+
316
class SlackOptionalIngestion(BaseModel):
    """
    Throughput and payload controls for Slack ingestion.
    """

    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # Accepted range is 1..200 inclusive.
    batch_size: int | None = Field(
        default=200,
        ge=1,
        le=200,
        description="Messages per API call (max 200)",
    )
    # Must be non-negative; there is no upper bound.
    rate_limit_delay_seconds: float | None = Field(
        default=1,
        ge=0.0,
        description="Delay between API calls to avoid rate limits",
    )
    include_thread_replies: bool | None = Field(
        default=False,
        description="Include thread replies in fetched content for detectors",
    )
333
+
334
+
335
class SlackOptional(BaseModel):
    # Groups the three optional Slack settings sections; each may be omitted.
    model_config = ConfigDict(extra="forbid")

    channels: SlackOptionalChannels | None = None
    time_range: SlackOptionalTimeRange | None = None
    ingestion: SlackOptionalIngestion | None = None
342
+
343
+
344
class ObjectStorageOptionalScope(BaseModel):
    """
    Object scope and filtering controls.
    """

    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    prefix: str | None = Field(
        default=None,
        description="Object key prefix filter (for example, exports/2026/)",
    )
    include_extensions: list[str] | None = Field(
        default=None,
        description="Optional extension allowlist (for example, .pdf, .csv, .parquet)",
    )
    exclude_extensions: list[str] | None = Field(
        default=None,
        description="Optional extension denylist",
    )
    include_empty_objects: bool | None = Field(
        default=False,
        description="Include zero-byte objects in extraction results",
    )
    include_object_metadata: bool | None = Field(
        default=True,
        description="Attach provider metadata (etag, size, content-type hints, timestamps) to asset checksums",
    )
    include_content_preview: bool | None = Field(
        default=True,
        description="Download object bytes to infer MIME and extract detector-ready text previews",
    )
373
+
374
+
375
class S3CompatibleStorageRequired(BaseModel):
    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # Required bucket name.
    bucket: str = Field(
        ...,
        description="Bucket name for AWS S3, MinIO, Cloudflare R2, Backblaze B2, Garage, and other S3-compatible endpoints",
    )
383
+
384
+
385
class S3CompatibleStorageMasked(BaseModel):
    """
    Optional static credentials. Leave empty to use ambient AWS credentials chain.
    """

    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    aws_access_key_id: str | None = Field(
        default=None,
        description="S3-compatible access key ID",
    )
    aws_secret_access_key: str | None = Field(
        default=None,
        description="S3-compatible secret access key",
    )
    aws_session_token: str | None = Field(
        default=None,
        description="Optional session token for temporary credentials",
    )
402
+
403
+
404
class S3CompatibleStorageOptionalConnection(BaseModel):
    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    region_name: str | None = Field(
        default=None,
        description="Region (recommended for AWS; required by some S3-compatible providers)",
    )
    endpoint_url: AnyUrl | None = Field(
        default=None,
        description="Custom endpoint URL for MinIO/R2/B2/Garage and other S3-compatible providers",
    )
    # Accepted range is 1.0..300.0 seconds inclusive.
    request_timeout_seconds: float | None = Field(
        default=30,
        ge=1.0,
        le=300.0,
        description="Network timeout in seconds for list/download operations",
    )
    # Accepted range is 1..1000 inclusive.
    max_keys_per_page: int | None = Field(
        default=200,
        ge=1,
        le=1000,
        description="Maximum objects requested per provider list API call",
    )
    # Default is 5 MiB (5242880); accepted range is 1 KiB..50 MiB inclusive.
    max_object_bytes: int | None = Field(
        default=5242880,
        ge=1024,
        le=52428800,
        description="Maximum bytes downloaded per object for MIME detection and text extraction",
    )
    verify_ssl: bool | None = Field(
        default=True,
        description="TLS certificate verification toggle",
    )
437
+
438
+
439
class S3CompatibleStorageOptional(BaseModel):
    # Groups the optional connection and scope sections; each may be omitted.
    model_config = ConfigDict(extra="forbid")

    connection: S3CompatibleStorageOptionalConnection | None = None
    scope: ObjectStorageOptionalScope | None = None
445
+
446
+
447
class AzureBlobStorageRequired(BaseModel):
    # Both fields are required; unknown keys are rejected.
    model_config = ConfigDict(extra="forbid")

    account_url: AnyUrl = Field(
        ...,
        description="Azure Blob account URL (for example, https://<account>.blob.core.windows.net)",
    )
    container: str = Field(
        ...,
        description="Azure Blob container name",
    )
456
+
457
+
458
class AzureBlobStorageMasked(BaseModel):
    """
    Optional Azure credentials. Leave empty to use managed identity/default credential chain.
    """

    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    azure_connection_string: str | None = Field(
        default=None,
        description="Azure storage connection string (takes precedence over other auth fields)",
    )
    azure_account_key: str | None = Field(
        default=None,
        description="Azure storage account key",
    )
    azure_sas_token: str | None = Field(
        default=None,
        description="Azure SAS token",
    )
    # The three fields below form the service-principal credential set.
    azure_client_id: str | None = Field(
        default=None,
        description="Azure Entra client ID (service principal auth)",
    )
    azure_client_secret: str | None = Field(
        default=None,
        description="Azure Entra client secret (service principal auth)",
    )
    azure_tenant_id: str | None = Field(
        default=None,
        description="Azure Entra tenant ID (service principal auth)",
    )
481
+
482
+
483
class AzureBlobStorageOptionalConnection(BaseModel):
    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # Accepted range is 1.0..300.0 seconds inclusive.
    request_timeout_seconds: float | None = Field(
        default=30,
        ge=1.0,
        le=300.0,
        description="Network timeout in seconds for list/download operations",
    )
    # Accepted range is 1..1000 inclusive.
    max_keys_per_page: int | None = Field(
        default=200,
        ge=1,
        le=1000,
        description="Maximum blobs requested per list page",
    )
    # Default is 5 MiB (5242880); accepted range is 1 KiB..50 MiB inclusive.
    max_object_bytes: int | None = Field(
        default=5242880,
        ge=1024,
        le=52428800,
        description="Maximum bytes downloaded per blob for MIME detection and text extraction",
    )
502
+
503
+
504
class AzureBlobStorageOptional(BaseModel):
    # Groups the optional connection and scope sections; each may be omitted.
    model_config = ConfigDict(extra="forbid")

    connection: AzureBlobStorageOptionalConnection | None = None
    scope: ObjectStorageOptionalScope | None = None
510
+
511
+
512
class GoogleCloudStorageRequired(BaseModel):
    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # Required bucket name.
    bucket: str = Field(
        ...,
        description="Google Cloud Storage bucket name",
    )
517
+
518
+
519
class GoogleCloudStorageMasked(BaseModel):
    """
    Optional inline service account credentials JSON. Leave empty to use ADC/workload identity.
    """

    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # Stored as a raw string; this model does not parse the JSON.
    gcp_credentials_json: str | None = Field(
        default=None,
        description="Google service account credentials JSON as inline string",
    )
530
+
531
+
532
class GoogleCloudStorageOptionalConnection(BaseModel):
    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    project_id: str | None = Field(
        default=None,
        description="Optional GCP project ID override for auth context and bucket listing",
    )
    gcp_credentials_file: str | None = Field(
        default=None,
        description="Path to Google service account JSON credentials file",
    )
    # Accepted range is 1.0..300.0 seconds inclusive.
    request_timeout_seconds: float | None = Field(
        default=30,
        ge=1.0,
        le=300.0,
        description="Network timeout in seconds for list/download operations",
    )
    # Accepted range is 1..1000 inclusive.
    max_keys_per_page: int | None = Field(
        default=200,
        ge=1,
        le=1000,
        description="Maximum objects requested per list page",
    )
    # Default is 5 MiB (5242880); accepted range is 1 KiB..50 MiB inclusive.
    max_object_bytes: int | None = Field(
        default=5242880,
        ge=1024,
        le=52428800,
        description="Maximum bytes downloaded per object for MIME detection and text extraction",
    )
558
+
559
+
560
class GoogleCloudStorageOptional(BaseModel):
    # Groups the optional connection and scope sections; each may be omitted.
    model_config = ConfigDict(extra="forbid")

    connection: GoogleCloudStorageOptionalConnection | None = None
    scope: ObjectStorageOptionalScope | None = None
566
+
567
+
568
+ class PostgreSQLSSLMode(StrEnum):
569
+ """
570
+ SSL mode for PostgreSQL connection
571
+ """
572
+
573
+ disable = 'disable'
574
+ allow = 'allow'
575
+ prefer = 'prefer'
576
+ require = 'require'
577
+ verify_ca = 'verify-ca'
578
+ verify_full = 'verify-full'
579
+
580
+
581
class PostgreSQLRequired(BaseModel):
    # Both fields are required; unknown keys are rejected.
    model_config = ConfigDict(extra="forbid")

    host: str = Field(..., description="PostgreSQL host")
    # Accepted range is 1..65535 inclusive.
    port: int = Field(
        ...,
        ge=1,
        le=65535,
        description="PostgreSQL port",
    )
587
+
588
+
589
class PostgreSQLMasked(BaseModel):
    # Both credentials are required; unknown keys are rejected.
    model_config = ConfigDict(extra="forbid")

    username: str = Field(
        ...,
        description="Database username",
    )
    password: str = Field(
        ...,
        description="Database password",
    )
595
+
596
+
597
class PostgreSQLOptionalConnection(BaseModel):
    """
    Connection tuning and SSL behavior.
    """

    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(
        extra='forbid',
    )
    # The default is the enum member rather than the bare string 'prefer'.
    # Pydantic does not validate defaults unless validate_default is enabled,
    # so a plain-string default would leave an unset ssl_mode as a str while
    # supplied values are coerced to PostgreSQLSSLMode. The member is a str
    # equal to 'prefer', so equality checks and JSON output are unchanged.
    ssl_mode: PostgreSQLSSLMode | None = PostgreSQLSSLMode.prefer
    # Accepted range is 1..120 seconds inclusive.
    connect_timeout_seconds: int | None = Field(
        10, description='Connection timeout in seconds', ge=1, le=120
    )
609
+
610
+
611
class PostgreSQLOptionalScope(BaseModel):
    """
    Database, schema, and table selection scope.
    """

    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    database: str | None = Field(
        default=None,
        description="Single database to scan (optional when include_all_databases is true)",
    )
    include_all_databases: bool | None = Field(
        default=False,
        description="Scan all non-template databases visible to this user",
    )
    maintenance_database: str | None = Field(
        default="postgres",
        description="Database used for database enumeration when include_all_databases is true",
    )
    include_schemas: list[str] | None = Field(
        default=None,
        description="Optional schema allowlist (exact schema names)",
    )
    # Three schemas are excluded by default.
    exclude_schemas: list[str] | None = Field(
        default=["information_schema", "pg_catalog", "pg_toast"],
        description="Schema denylist (exact schema names)",
    )
    include_tables: list[str] | None = Field(
        default=None,
        description="Optional table allowlist. Accepted forms: schema.table or database.schema.table",
    )
    # When set, must be at least 1.
    table_limit: int | None = Field(
        default=None,
        ge=1,
        description="Optional cap on number of table assets extracted",
    )
644
+
645
+
646
class PostgreSQLOptional(BaseModel):
    # Groups the optional connection and scope sections; each may be omitted.
    model_config = ConfigDict(extra="forbid")

    connection: PostgreSQLOptionalConnection | None = None
    scope: PostgreSQLOptionalScope | None = None
652
+
653
+
654
+ class MySQLSSLMode(StrEnum):
655
+ """
656
+ SSL/TLS connection mode. DISABLED: no TLS; PREFERRED: TLS when available (default); REQUIRED: mandate TLS without certificate verification; VERIFY_CA: mandate TLS and verify the CA certificate (requires ssl_ca); VERIFY_IDENTITY: mandate TLS, verify CA, and verify server hostname.
657
+ """
658
+
659
+ DISABLED = 'DISABLED'
660
+ PREFERRED = 'PREFERRED'
661
+ REQUIRED = 'REQUIRED'
662
+ VERIFY_CA = 'VERIFY_CA'
663
+ VERIFY_IDENTITY = 'VERIFY_IDENTITY'
664
+
665
+
666
class MySQLRequired(BaseModel):
    # Both fields are required; unknown keys are rejected.
    model_config = ConfigDict(extra="forbid")

    host: str = Field(..., description="MySQL host")
    # Accepted range is 1..65535 inclusive.
    port: int = Field(
        ...,
        ge=1,
        le=65535,
        description="MySQL port",
    )
672
+
673
+
674
class MySQLMasked(BaseModel):
    # Username and password are required; the CA certificate is optional.
    model_config = ConfigDict(extra="forbid")

    username: str = Field(
        ...,
        description="Database username",
    )
    password: str = Field(
        ...,
        description="Database password",
    )
    ssl_ca: str | None = Field(
        default=None,
        description="PEM-encoded CA certificate for SSL/TLS verification. Paste the full certificate content (-----BEGIN CERTIFICATE----- ... -----END CERTIFICATE-----). Required when ssl_mode is VERIFY_CA or VERIFY_IDENTITY.",
    )
684
+
685
+
686
class MySQLOptionalConnection(BaseModel):
    """
    Connection tuning for MySQL.
    """

    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(
        extra='forbid',
    )
    # Accepted range is 1..120 seconds inclusive.
    connect_timeout_seconds: int | None = Field(
        10, description='Connection timeout in seconds', ge=1, le=120
    )
    # The default is the enum member rather than the bare string 'PREFERRED'.
    # Pydantic does not validate defaults unless validate_default is enabled,
    # so a plain-string default would leave an unset ssl_mode as a str while
    # supplied values are coerced to MySQLSSLMode. The member is a str equal
    # to 'PREFERRED', so equality checks and JSON output are unchanged.
    ssl_mode: MySQLSSLMode | None = MySQLSSLMode.PREFERRED
    allow_public_key_retrieval: bool | None = Field(
        False,
        description='Allow automatic RSA public key retrieval from the server for caching_sha2_password authentication (MySQL 8+). Only needed when not using SSL and connecting to MySQL 8 servers using the default authentication plugin.',
    )
702
+
703
+
704
class MySQLOptionalScope(BaseModel):
    """
    Database and table selection scope.
    """

    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    database: str | None = Field(
        default=None,
        description="Single database to scan (optional when include_all_databases is true)",
    )
    include_all_databases: bool | None = Field(
        default=False,
        description="Scan all visible databases except excluded system databases",
    )
    # Four databases are excluded by default.
    exclude_databases: list[str] | None = Field(
        default=["information_schema", "mysql", "performance_schema", "sys"],
        description="Database denylist (exact database names)",
    )
    include_tables: list[str] | None = Field(
        default=None,
        description="Optional table allowlist. Accepted forms: table or database.table",
    )
    # When set, must be at least 1.
    table_limit: int | None = Field(
        default=None,
        ge=1,
        description="Optional cap on number of table assets extracted per database",
    )
732
+
733
+
734
class MySQLOptional(BaseModel):
    # Groups the optional connection and scope sections; each may be omitted.
    model_config = ConfigDict(extra="forbid")

    connection: MySQLOptionalConnection | None = None
    scope: MySQLOptionalScope | None = None
740
+
741
+
742
class MSSQLRequired(BaseModel):
    # Both fields are required; unknown keys are rejected.
    model_config = ConfigDict(extra="forbid")

    host: str = Field(..., description="SQL Server host endpoint")
    # Accepted range is 1..65535 inclusive.
    port: int = Field(
        ...,
        ge=1,
        le=65535,
        description="SQL Server TCP port",
    )
748
+
749
+
750
class MSSQLMasked(BaseModel):
    # Both credentials are required; unknown keys are rejected.
    model_config = ConfigDict(extra="forbid")

    username: str = Field(
        ...,
        description="SQL Server login username",
    )
    password: str = Field(
        ...,
        description="SQL Server login password",
    )
756
+
757
+
758
+ class AuthMode(StrEnum):
759
+ """
760
+ Authentication mode. CUSTOM uses masked.username as-is, LDAP prefixes username with ldap_domain when provided.
761
+ """
762
+
763
+ CUSTOM = 'CUSTOM'
764
+ LDAP = 'LDAP'
765
+
766
+
767
class MSSQLOptionalConnection(BaseModel):
    """
    Connection tuning for SQL Server.
    """

    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(
        extra='forbid',
    )
    # The default is the enum member rather than the bare string 'CUSTOM'.
    # Pydantic does not validate defaults unless validate_default is enabled,
    # so a plain-string default would leave an unset auth_mode as a str while
    # supplied values are coerced to AuthMode. The member is a str equal to
    # 'CUSTOM', so equality checks and JSON output are unchanged.
    auth_mode: AuthMode | None = Field(
        AuthMode.CUSTOM,
        description='Authentication mode. CUSTOM uses masked.username as-is, LDAP prefixes username with ldap_domain when provided.',
    )
    ldap_domain: str | None = Field(
        None,
        description='Optional LDAP/AD domain for LDAP auth mode (for example, CORP or corp.local).',
    )
    # Tri-state: True, False, or None (unset).
    is_aws_rds: bool | None = Field(
        None,
        description='Set true for AWS RDS SQL Server, false for on-prem. If unset, runtime auto-detects using host patterns.',
    )
    # Accepted range is 1..120 seconds inclusive.
    connect_timeout_seconds: int | None = Field(
        10, description='Connection timeout in seconds', ge=1, le=120
    )
790
+
791
+
792
class MSSQLOptionalExtraction(BaseModel):
    """
    Lineage and advanced metadata extraction controls for SQL Server.
    """

    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # --- Toggles that default to enabled ---------------------------------
    include_table_lineage: bool | None = Field(
        default=True,
        description="Include table-level lineage links using foreign key metadata.",
    )
    include_view_lineage: bool | None = Field(
        default=True,
        description="Include view-to-table/view lineage links using SQL Server dependency metadata.",
    )
    include_view_column_lineage: bool | None = Field(
        default=True,
        description="Enable view column lineage extraction when available.",
    )
    include_stored_procedures: bool | None = Field(
        default=True,
        description="Include stored procedure metadata extraction.",
    )
    include_stored_procedures_code: bool | None = Field(
        default=True,
        description="Include stored procedure source code metadata when available.",
    )
    include_jobs: bool | None = Field(
        default=True,
        description="Include SQL Server Agent jobs metadata extraction.",
    )

    # --- Query-based lineage (disabled by default) and its tuning --------
    include_query_lineage: bool | None = Field(
        default=False,
        description="Enable query-based lineage extraction from Query Store/DMVs.",
    )
    # Accepted range is 1..10000 inclusive.
    max_queries_to_extract: int | None = Field(
        default=1000,
        ge=1,
        le=10000,
        description="Maximum number of queries to analyze for query-based lineage.",
    )
    # Must be at least 1; there is no upper bound.
    min_query_calls: int | None = Field(
        default=1,
        ge=1,
        description="Minimum execution count for queries to be included in query-based lineage.",
    )
    # max_length sits on a list field, so it presumably caps the number of
    # patterns at 100 rather than the length of each pattern — confirm
    # against the pydantic Field documentation.
    query_exclude_patterns: list[str] | None = Field(
        default=None,
        max_length=100,
        description="SQL LIKE patterns used to exclude queries from query-based lineage.",
    )

    # --- Usage statistics (disabled by default) --------------------------
    include_usage_statistics: bool | None = Field(
        default=False,
        description="Enable usage statistics extraction from SQL query metadata.",
    )
844
+
845
+
846
class MSSQLOptionalScope(BaseModel):
    """
    Database, schema, and object selection scope.
    """

    # Keys that are not declared below are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    database: str | None = Field(
        default=None,
        description="Single database to scan (optional when include_all_databases is true)",
    )
    include_all_databases: bool | None = Field(
        default=False,
        description="Scan all visible databases except excluded system databases",
    )
    # Three databases are excluded by default.
    exclude_databases: list[str] | None = Field(
        default=["master", "tempdb", "model"],
        description="Database denylist (exact database names)",
    )
    include_schemas: list[str] | None = Field(
        default=None,
        description="Optional schema allowlist (exact schema names)",
    )
    # Two schemas are excluded by default.
    exclude_schemas: list[str] | None = Field(
        default=["INFORMATION_SCHEMA", "sys"],
        description="Schema denylist (exact schema names)",
    )
    # Here include_tables is a boolean toggle; the name allowlist for this
    # source is include_objects.
    include_tables: bool | None = Field(
        default=True,
        description="Include table assets in extraction",
    )
    include_views: bool | None = Field(
        default=True,
        description="Include view assets in extraction",
    )
    include_objects: list[str] | None = Field(
        default=None,
        description="Optional object allowlist. Accepted forms: schema.object or database.schema.object",
    )
    # When set, must be at least 1.
    table_limit: int | None = Field(
        default=None,
        ge=1,
        description="Optional cap on number of table/view assets extracted",
    )
885
+
886
+
887
class MSSQLOptional(BaseModel):
    # Container for the optional MSSQL setting groups; each group may be
    # omitted (defaults to None) and undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    connection: MSSQLOptionalConnection | None = None
    scope: MSSQLOptionalScope | None = None
    extraction: MSSQLOptionalExtraction | None = None
894
+
895
+
896
class OracleRequired(BaseModel):
    # Mandatory Oracle endpoint coordinates; all three fields are required
    # and undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    host: str = Field(default=..., description="Oracle host endpoint")
    # Constrained to the valid TCP port range 1..65535.
    port: int = Field(default=..., description="Oracle TCP port", ge=1, le=65535)
    service_name: str = Field(
        default=...,
        description="Oracle service name (for example, TEST_PDB)",
    )
905
+
906
+
907
class OracleMasked(BaseModel):
    # Oracle login credentials; both fields are required and undeclared keys
    # are rejected.
    model_config = ConfigDict(extra="forbid")

    username: str = Field(default=..., description="Oracle login username")
    password: str = Field(default=..., description="Oracle login password")
913
+
914
+
915
class OracleOptionalConnection(BaseModel):
    """
    Connection tuning for Oracle.
    """

    # Undeclared keys are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # Defaults to 10; accepted range is 1..120.
    connect_timeout_seconds: int | None = Field(
        default=10,
        description="Connection timeout in seconds",
        ge=1,
        le=120,
    )
926
+
927
+
928
class OracleOptionalScope(BaseModel):
    """
    Schema, object, and lineage extraction scope.
    """

    # Undeclared keys are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # --- schema selection ---
    include_schemas: list[str] | None = Field(
        default=None,
        description="Optional schema allowlist (exact schema names)",
    )
    exclude_schemas: list[str] | None = Field(
        default=[
            "SYS",
            "SYSTEM",
            "DBSNMP",
            "WMSYS",
            "CTXSYS",
            "XDB",
            "MDSYS",
            "ORDSYS",
            "OUTLN",
            "ORDDATA",
        ],
        description="Schema denylist (exact schema names)",
    )

    # --- object selection ---
    include_tables: bool | None = Field(
        default=True,
        description="Include table assets in extraction",
    )
    include_views: bool | None = Field(
        default=True,
        description="Include view assets in extraction",
    )

    # --- lineage switches ---
    include_view_lineage: bool | None = Field(
        default=True,
        description="Extract coarse lineage links from views to referenced tables/views",
    )
    include_view_column_lineage: bool | None = Field(
        default=True,
        description="Enable view column lineage collection from Oracle dependency metadata",
    )

    include_objects: list[str] | None = Field(
        default=None,
        description="Optional object allowlist. Accepted forms: schema.object or service.schema.object",
    )
    # When set, must be at least 1 (ge=1).
    table_limit: int | None = Field(
        default=None,
        description="Optional cap on number of table/view assets extracted",
        ge=1,
    )
975
+
976
+
977
class OracleOptional(BaseModel):
    # Container for the optional Oracle setting groups; each group may be
    # omitted (defaults to None) and undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    connection: OracleOptionalConnection | None = None
    scope: OracleOptionalScope | None = None
983
+
984
+
985
+ class HiveScheme(StrEnum):
986
+ """
987
+ Hive transport and driver scheme
988
+ """
989
+
990
+ hive = 'hive'
991
+ hive_http = 'hive+http'
992
+ hive_https = 'hive+https'
993
+ sparksql = 'sparksql'
994
+ databricks_pyhive = 'databricks+pyhive'
995
+
996
+
997
class HiveRequired(BaseModel):
    # Mandatory Hive endpoint coordinates; both fields are required and
    # undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    host: str = Field(default=..., description="Hive host endpoint")
    # Constrained to the valid TCP port range 1..65535.
    port: int = Field(default=..., description="Hive TCP port", ge=1, le=65535)
1003
+
1004
+
1005
class HiveMasked(BaseModel):
    # Hive login credentials; both fields are required and undeclared keys
    # are rejected.
    model_config = ConfigDict(extra="forbid")

    username: str = Field(default=..., description="Hive login username")
    password: str = Field(default=..., description="Hive login password")
1011
+
1012
+
1013
class HiveOptionalConnection(BaseModel):
    """
    Hive connection transport and authentication options.
    """

    # Undeclared keys are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # No scheme is selected by default.
    scheme: HiveScheme | None = None
    # Free-form mapping; defaults to an empty dict.
    connect_args: dict[str, Any] | None = Field(
        default={},
        description="Additional PyHive connection arguments (e.g. auth, kerberos_service_name, http_path).",
    )
1026
+
1027
+
1028
class HiveOptionalScope(BaseModel):
    """
    Hive database and object selection scope.
    """

    # Undeclared keys are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # --- database selection ---
    database: str | None = Field(
        default=None,
        description="Single Hive database to scan (optional when include_all_databases is true)",
    )
    include_all_databases: bool | None = Field(
        default=False,
        description="Scan all visible Hive databases except excluded system databases",
    )
    exclude_databases: list[str] | None = Field(
        default=["information_schema", "sys"],
        description="Database denylist (exact database names)",
    )

    # --- object selection ---
    include_tables: bool | None = Field(
        default=True,
        description="Include table assets in extraction",
    )
    include_views: bool | None = Field(
        default=True,
        description="Include view assets in extraction",
    )
    include_objects: list[str] | None = Field(
        default=None,
        description="Optional object allowlist. Accepted forms: table or database.table",
    )
    # When set, must be at least 1 (ge=1).
    table_limit: int | None = Field(
        default=None,
        description="Optional cap on number of table/view assets extracted per database",
        ge=1,
    )
1063
+
1064
+
1065
class HiveOptional(BaseModel):
    # Container for the optional Hive setting groups; each group may be
    # omitted (defaults to None) and undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    connection: HiveOptionalConnection | None = None
    scope: HiveOptionalScope | None = None
1071
+
1072
+
1073
+ class DatabricksAuthMode(StrEnum):
1074
+ """
1075
+ Databricks authentication mode
1076
+ """
1077
+
1078
+ PAT_TOKEN = 'PAT_TOKEN'
1079
+ SERVICE_PRINCIPAL = 'SERVICE_PRINCIPAL'
1080
+
1081
+
1082
class DatabricksRequiredPat(BaseModel):
    # Required Databricks settings for the PAT_TOKEN auth mode; undeclared
    # keys are rejected.
    model_config = ConfigDict(extra="forbid")

    # Only the literal string 'PAT_TOKEN' is accepted here.
    auth_mode: Literal["PAT_TOKEN"]
    workspace_url: AnyUrl = Field(
        default=...,
        description="Databricks workspace URL (for example, https://adb-1234567890123456.7.azuredatabricks.net)",
    )
    warehouse_id: str = Field(
        default=...,
        description="Databricks SQL warehouse ID used for sampling queries",
    )
1094
+
1095
+
1096
class DatabricksRequiredServicePrincipal(BaseModel):
    # Required Databricks settings for the SERVICE_PRINCIPAL auth mode;
    # undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    # Only the literal string 'SERVICE_PRINCIPAL' is accepted here.
    auth_mode: Literal["SERVICE_PRINCIPAL"]
    workspace_url: AnyUrl = Field(
        default=...,
        description="Databricks workspace URL (for example, https://adb-1234567890123456.7.azuredatabricks.net)",
    )
    warehouse_id: str = Field(
        default=...,
        description="Databricks SQL warehouse ID used for sampling queries",
    )
    client_id: str = Field(
        default=...,
        description="Databricks service principal client ID",
    )
1109
+
1110
+
1111
class DatabricksMaskedPat(BaseModel):
    # Secret for PAT-based Databricks auth; the token is required and
    # undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    token: str = Field(
        default=...,
        description="Databricks personal access token (PAT)",
    )
1116
+
1117
+
1118
class DatabricksMaskedServicePrincipal(BaseModel):
    # Secret for service-principal Databricks auth; the client secret is
    # required and undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    client_secret: str = Field(
        default=...,
        description="Databricks service principal client secret",
    )
1125
+
1126
+
1127
class DatabricksOptionalConnection(BaseModel):
    """
    Databricks API and SQL statement execution tuning options.
    """

    # Undeclared keys are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # Defaults to 30; accepted range is 5..300.
    timeout_seconds: int | None = Field(
        default=30,
        description="HTTP timeout for Databricks API calls",
        ge=5,
        le=300,
    )
    # Defaults to 60; accepted range is 5..600.
    statement_timeout_seconds: int | None = Field(
        default=60,
        description="Maximum wait timeout for SQL statement execution",
        ge=5,
        le=600,
    )
    # Defaults to 30; accepted range is 1..120.
    max_statement_polls: int | None = Field(
        default=30,
        description="Maximum polling attempts when waiting for SQL statement completion",
        ge=1,
        le=120,
    )
1147
+
1148
+
1149
class DatabricksOptionalScope(BaseModel):
    """
    Databricks Unity Catalog scope filters.
    """

    # Undeclared keys are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # --- catalog selection ---
    include_catalogs: list[str] | None = Field(
        default=None,
        description="Optional catalog allowlist (exact catalog names)",
    )
    exclude_catalogs: list[str] | None = Field(
        default=[],
        description="Catalog denylist (exact catalog names)",
    )

    # --- schema selection ---
    include_schemas: list[str] | None = Field(
        default=None,
        description="Optional schema allowlist. Accepted forms: schema or catalog.schema",
    )
    exclude_schemas: list[str] | None = Field(
        default=["information_schema"],
        description="Schema denylist. Accepted forms: schema or catalog.schema",
    )

    # --- table selection ---
    # Unlike the boolean include_tables flag on other sources' scope models
    # in this file, this field is an allowlist of names.
    include_tables: list[str] | None = Field(
        default=None,
        description="Optional table allowlist. Accepted forms: table, schema.table, or catalog.schema.table",
    )
    # When set, must be at least 1 (ge=1).
    table_limit_per_schema: int | None = Field(
        default=None,
        description="Optional cap on number of Unity Catalog tables extracted per schema",
        ge=1,
    )
    include_hive_metastore: bool | None = Field(
        default=False,
        description="Include hive_metastore catalog in extraction",
    )
1183
+
1184
+
1185
class DatabricksOptionalExtraction(BaseModel):
    """
    Databricks Unity Catalog extraction feature flags.
    """

    # Undeclared keys are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # Table-level lineage is the only flag that defaults to True.
    include_table_lineage: bool | None = Field(
        default=True,
        description="Include table-level lineage links between Unity Catalog tables",
    )
    include_column_lineage: bool | None = Field(
        default=False,
        description="Attempt to fetch column-level lineage metadata",
    )
    include_notebooks: bool | None = Field(
        default=False,
        description="Extract workspace notebook metadata as additional assets",
    )
    include_pipelines: bool | None = Field(
        default=False,
        description="Extract Delta Live Tables pipeline metadata as additional assets",
    )
1207
+
1208
+
1209
class DatabricksOptional(BaseModel):
    # Container for the optional Databricks setting groups; each group may
    # be omitted (defaults to None) and undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    connection: DatabricksOptionalConnection | None = None
    scope: DatabricksOptionalScope | None = None
    extraction: DatabricksOptionalExtraction | None = None
1216
+
1217
+
1218
+ class SnowflakeAuthenticationType(StrEnum):
1219
+ """
1220
+ Snowflake authentication type
1221
+ """
1222
+
1223
+ DEFAULT_AUTHENTICATOR = 'DEFAULT_AUTHENTICATOR'
1224
+ EXTERNAL_BROWSER_AUTHENTICATOR = 'EXTERNAL_BROWSER_AUTHENTICATOR'
1225
+ KEY_PAIR_AUTHENTICATOR = 'KEY_PAIR_AUTHENTICATOR'
1226
+ OAUTH_AUTHENTICATOR_TOKEN = 'OAUTH_AUTHENTICATOR_TOKEN'
1227
+
1228
+
1229
class SnowflakeRequiredDefaultAuthenticator(BaseModel):
    # Required Snowflake settings for the DEFAULT_AUTHENTICATOR variant;
    # undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    # Only the literal string 'DEFAULT_AUTHENTICATOR' is accepted here.
    authentication_type: Literal["DEFAULT_AUTHENTICATOR"]
    account_id: str = Field(
        default=...,
        description="Snowflake account identifier (for example, xy12345.us-east-2.aws or LMAUONV-ONE_DATA_DEV)",
    )
1238
+
1239
+
1240
class SnowflakeRequiredExternalBrowserAuthenticator(BaseModel):
    # Required Snowflake settings for the EXTERNAL_BROWSER_AUTHENTICATOR
    # variant; undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    # Only the literal string 'EXTERNAL_BROWSER_AUTHENTICATOR' is accepted.
    authentication_type: Literal["EXTERNAL_BROWSER_AUTHENTICATOR"]
    account_id: str = Field(
        default=...,
        description="Snowflake account identifier (for example, xy12345.us-east-2.aws or LMAUONV-ONE_DATA_DEV)",
    )
1249
+
1250
+
1251
class SnowflakeRequiredKeyPairAuthenticator(BaseModel):
    # Required Snowflake settings for the KEY_PAIR_AUTHENTICATOR variant;
    # undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    # Only the literal string 'KEY_PAIR_AUTHENTICATOR' is accepted here.
    authentication_type: Literal["KEY_PAIR_AUTHENTICATOR"]
    account_id: str = Field(
        default=...,
        description="Snowflake account identifier (for example, xy12345.us-east-2.aws or LMAUONV-ONE_DATA_DEV)",
    )
1260
+
1261
+
1262
class SnowflakeRequiredOauthAuthenticatorToken(BaseModel):
    # Required Snowflake settings for the OAUTH_AUTHENTICATOR_TOKEN variant;
    # undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    # Only the literal string 'OAUTH_AUTHENTICATOR_TOKEN' is accepted here.
    authentication_type: Literal["OAUTH_AUTHENTICATOR_TOKEN"]
    account_id: str = Field(
        default=...,
        description="Snowflake account identifier (for example, xy12345.us-east-2.aws or LMAUONV-ONE_DATA_DEV)",
    )
1271
+
1272
+
1273
class SnowflakeMaskedDefaultAuthenticator(BaseModel):
    # Username/password credentials for Snowflake; both fields are required
    # and undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    username: str = Field(default=..., description="Snowflake login username")
    password: str = Field(default=..., description="Snowflake login password")
1279
+
1280
+
1281
class SnowflakeMaskedExternalBrowserAuthenticator(BaseModel):
    # Only a username is declared for this variant; it is required and
    # undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    username: str = Field(default=..., description="Snowflake login username")
1286
+
1287
+
1288
class SnowflakeMaskedKeyPairAuthenticator(BaseModel):
    # Key-pair credentials for Snowflake; username and private_key are
    # required, the key password is optional. Undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    username: str = Field(default=..., description="Snowflake login username")
    private_key: str = Field(
        default=...,
        description="Snowflake private key PEM content. You can pass escaped newlines (\\n).",
    )
    private_key_password: str | None = Field(
        default=None,
        description="Password for encrypted private key PEM (optional when key is not encrypted).",
    )
1301
+
1302
+
1303
class SnowflakeMaskedOauthAuthenticatorToken(BaseModel):
    # OAuth credentials for Snowflake; both fields are required and
    # undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    username: str = Field(default=..., description="Snowflake login username")
    token: str = Field(
        default=...,
        description="OAuth bearer token for Snowflake authentication",
    )
1311
+
1312
+
1313
class SnowflakeOptionalConnection(BaseModel):
    """
    Snowflake connection and session tuning options.
    """

    # Undeclared keys are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    warehouse: str | None = Field(
        default=None,
        description="Snowflake warehouse to use for metadata and sampling queries",
    )
    role: str | None = Field(
        default=None,
        description="Snowflake role to use for metadata and sampling queries",
    )
    snowflake_domain: str | None = Field(
        default="snowflakecomputing.com",
        description="Snowflake domain suffix (use snowflakecomputing.cn for China regions).",
    )
    # Defaults to 15; accepted range is 1..300.
    connect_timeout_seconds: int | None = Field(
        default=15,
        description="Connection timeout in seconds",
        ge=1,
        le=300,
    )
    # Free-form mapping; defaults to an empty dict.
    connect_args: dict[str, Any] | None = Field(
        default={},
        description="Additional snowflake.connector.connect keyword arguments (advanced usage).",
    )
1338
+
1339
+
1340
class SnowflakeOptionalScope(BaseModel):
    """
    Database, schema, and object selection scope.
    """

    # Undeclared keys are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # --- database selection ---
    database: str | None = Field(
        default=None,
        description="Single database to scan (optional when include_all_databases is true)",
    )
    include_all_databases: bool | None = Field(
        default=False,
        description="Scan all visible databases except excluded system databases",
    )
    exclude_databases: list[str] | None = Field(
        default=["SNOWFLAKE", "SNOWFLAKE_SAMPLE_DATA"],
        description="Database denylist (exact database names)",
    )

    # --- schema selection ---
    include_schemas: list[str] | None = Field(
        default=None,
        description="Optional schema allowlist (exact schema names)",
    )
    exclude_schemas: list[str] | None = Field(
        default=["INFORMATION_SCHEMA"],
        description="Schema denylist (exact schema names)",
    )

    # --- object selection ---
    include_tables: bool | None = Field(
        default=True,
        description="Include table assets in extraction",
    )
    include_views: bool | None = Field(
        default=True,
        description="Include view assets in extraction",
    )
    include_objects: list[str] | None = Field(
        default=None,
        description="Optional object allowlist. Accepted forms: schema.object or database.schema.object",
    )
    # When set, must be at least 1 (ge=1).
    table_limit: int | None = Field(
        default=None,
        description="Optional cap on number of table/view assets extracted",
        ge=1,
    )
1378
+
1379
+
1380
class SnowflakeOptionalExtraction(BaseModel):
    """
    Lineage extraction controls for Snowflake metadata ingestion.
    """

    # Undeclared keys are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # Typed as AwareDatetime, so a timezone-naive value is not accepted.
    start_time: AwareDatetime | None = Field(
        default=None,
        description="Optional lineage lower bound timestamp (ISO 8601).",
    )
    include_table_lineage: bool | None = Field(
        default=True,
        description="Include table-level lineage links when dependency metadata is accessible.",
    )
    include_view_lineage: bool | None = Field(
        default=True,
        description="Include view-to-table/view lineage links when dependency metadata is accessible.",
    )
1399
+
1400
+
1401
class SnowflakeOptional(BaseModel):
    # Container for the optional Snowflake setting groups; each group may be
    # omitted (defaults to None) and undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    connection: SnowflakeOptionalConnection | None = None
    scope: SnowflakeOptionalScope | None = None
    extraction: SnowflakeOptionalExtraction | None = None
1408
+
1409
+
1410
+ class MongoDBDeployment(StrEnum):
1411
+ """
1412
+ MongoDB deployment mode
1413
+ """
1414
+
1415
+ ATLAS = 'ATLAS'
1416
+ ON_PREM = 'ON_PREM'
1417
+
1418
+
1419
class MongoDBRequiredAtlas(BaseModel):
    # Required MongoDB settings for the ATLAS deployment variant; undeclared
    # keys are rejected.
    model_config = ConfigDict(extra="forbid")

    # Only the literal string 'ATLAS' is accepted here.
    deployment: Literal["ATLAS"]
    # Must be a non-empty string (min_length=1).
    cluster_host: str = Field(
        default=...,
        description="Atlas SRV cluster host or full mongodb+srv:// URI (for example, cluster.abc123.mongodb.net or mongodb+srv://cluster.abc123.mongodb.net). If a full URI is supplied, the host is extracted and credentials are discarded — set them in masked fields instead.",
        min_length=1,
    )
1429
+
1430
+
1431
class MongoDBRequiredOnPrem(BaseModel):
    # Required MongoDB settings for the ON_PREM deployment variant;
    # undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    # Only the literal string 'ON_PREM' is accepted here.
    deployment: Literal["ON_PREM"]
    host: str = Field(default=..., description="On-prem MongoDB host endpoint")
    # Constrained to the valid TCP port range 1..65535.
    port: int = Field(
        default=...,
        description="On-prem MongoDB TCP port",
        ge=1,
        le=65535,
    )
1438
+
1439
+
1440
class MongoDBMaskedUsernamePassword(BaseModel):
    # MongoDB login credentials; both fields are required and undeclared
    # keys are rejected.
    model_config = ConfigDict(extra="forbid")

    username: str = Field(default=..., description="MongoDB login username")
    password: str = Field(default=..., description="MongoDB login password")
1446
+
1447
+
1448
class MongoDBMaskedNone(BaseModel):
    """
    Use when the MongoDB endpoint allows anonymous/no-auth access.
    """

    # Declares no fields; with extra='forbid' only an empty object validates.
    model_config = ConfigDict(extra="forbid")
1456
+
1457
+
1458
+ class MongoDBAuthMechanism(StrEnum):
1459
+ """
1460
+ MongoDB authentication mechanism
1461
+ """
1462
+
1463
+ DEFAULT = 'DEFAULT'
1464
+ SCRAM_SHA_1 = 'SCRAM-SHA-1'
1465
+ SCRAM_SHA_256 = 'SCRAM-SHA-256'
1466
+ MONGODB_AWS = 'MONGODB-AWS'
1467
+ MONGODB_X509 = 'MONGODB-X509'
1468
+ GSSAPI = 'GSSAPI'
1469
+ PLAIN = 'PLAIN'
1470
+
1471
+
1472
class MongoDBOptionalConnection(BaseModel):
    """
    MongoDB connection and authentication tuning options.
    """

    # Undeclared keys are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # --- authentication ---
    auth_mechanism: MongoDBAuthMechanism | None = None
    auth_source: str | None = Field(
        default=None,
        description="Authentication database/source (for example, admin)",
    )

    # --- client / topology ---
    app_name: str | None = Field(
        default=None,
        description="MongoDB appName (Atlas and driver telemetry label)",
    )
    tls: bool | None = Field(
        default=None,
        description="Enable TLS for on-prem connections when required",
    )
    replica_set: str | None = Field(
        default=None,
        description="Replica set name for on-prem deployments",
    )
    direct_connection: bool | None = Field(
        default=None,
        description="Connect directly to a single host instead of topology discovery",
    )

    # Defaults to 10000 ms; accepted range is 100..120000.
    connect_timeout_ms: int | None = Field(
        default=10000,
        description="MongoDB connection timeout in milliseconds",
        ge=100,
        le=120000,
    )
    # Free-form mapping; defaults to an empty dict.
    options: dict[str, Any] | None = Field(
        default={},
        description="Additional pymongo.MongoClient keyword arguments (advanced usage).",
    )
1507
+
1508
+
1509
class MongoDBOptionalScope(BaseModel):
    """
    MongoDB database and collection selection scope.
    """

    # Undeclared keys are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # --- database selection ---
    database: str | None = Field(
        default=None,
        description="Single database to scan (optional when include_all_databases is true)",
    )
    # Defaults to True here.
    include_all_databases: bool | None = Field(
        default=True,
        description="Scan all visible databases except excluded system databases",
    )
    exclude_databases: list[str] | None = Field(
        default=["admin", "config", "local"],
        description="Database denylist (exact database names)",
    )

    # --- collection selection ---
    include_collections: list[str] | None = Field(
        default=None,
        description="Optional collection allowlist. Accepted forms: collection or database.collection",
    )
    exclude_collections: list[str] | None = Field(
        default=None,
        description="Optional collection denylist. Accepted forms: collection or database.collection",
    )
    include_system_collections: bool | None = Field(
        default=False,
        description="Include system.* collections when true",
    )
    # When set, must be at least 1 (ge=1).
    collection_limit: int | None = Field(
        default=None,
        description="Optional cap on number of collections extracted per database",
        ge=1,
    )
1544
+
1545
+
1546
class MongoDBOptional(BaseModel):
    # Container for the optional MongoDB setting groups; each group may be
    # omitted (defaults to None) and undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    connection: MongoDBOptionalConnection | None = None
    scope: MongoDBOptionalScope | None = None
1552
+
1553
+
1554
+ class PowerBIAuthMode(StrEnum):
1555
+ """
1556
+ PowerBI authentication mode
1557
+ """
1558
+
1559
+ SERVICE_PRINCIPAL = 'SERVICE_PRINCIPAL'
1560
+ ACCESS_TOKEN = 'ACCESS_TOKEN'
1561
+
1562
+
1563
class PowerBIRequiredServicePrincipal(BaseModel):
    # Required PowerBI settings for the SERVICE_PRINCIPAL auth mode;
    # undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    # Only the literal string 'SERVICE_PRINCIPAL' is accepted here.
    auth_mode: Literal["SERVICE_PRINCIPAL"]
    # Both identifiers must be exactly 36 characters drawn from hex digits
    # and '-'; the pattern does not check where the hyphens fall.
    tenant_id: str = Field(
        default=...,
        description="Azure tenant identifier",
        pattern="^[0-9a-fA-F-]{36}$",
    )
    client_id: str = Field(
        default=...,
        description="Azure app client identifier",
        pattern="^[0-9a-fA-F-]{36}$",
    )
1574
+
1575
+
1576
class PowerBIRequiredAccessToken(BaseModel):
    # Required PowerBI settings for the ACCESS_TOKEN auth mode; only the
    # discriminating auth_mode is declared and undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    # Only the literal string 'ACCESS_TOKEN' is accepted here.
    auth_mode: Literal["ACCESS_TOKEN"]
1581
+
1582
+
1583
class PowerBIMaskedClientSecret(BaseModel):
    # Secret for service-principal PowerBI auth; the client secret is
    # required and undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    client_secret: str = Field(
        default=...,
        description="Azure app client secret for service principal auth",
    )
1590
+
1591
+
1592
class PowerBIMaskedAccessToken(BaseModel):
    # Secret for token-based PowerBI auth; the access token is required and
    # undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    access_token: str = Field(
        default=...,
        description="Bearer token for PowerBI API access",
    )
1597
+
1598
+
1599
class PowerBIOptionalConnection(BaseModel):
    """
    PowerBI API endpoint and timeout controls.
    """

    # Undeclared keys are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # NOTE(review): both URL defaults below are plain str literals on
    # AnyUrl-typed fields. Pydantic does not validate defaults unless
    # validate_default is enabled, so a defaulted value presumably stays a
    # str while a supplied value becomes an AnyUrl -- confirm consumers
    # handle both.
    authority_url: AnyUrl | None = Field(
        default="https://login.microsoftonline.com",
        description="Authority base URL for Microsoft Entra token issuance",
    )
    api_base_url: AnyUrl | None = Field(
        default="https://api.powerbi.com/v1.0/myorg",
        description="PowerBI REST API base URL",
    )
    # Defaults to 30; accepted range is 5..300.
    timeout_seconds: int | None = Field(
        default=30,
        description="HTTP timeout for PowerBI API calls",
        ge=5,
        le=300,
    )
1617
+
1618
+
1619
class PowerBIOptionalScope(BaseModel):
    """
    Workspace scope controls for PowerBI ingestion.
    """

    # Undeclared keys are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # Two independent allowlists: by workspace ID and by workspace name.
    workspace_ids: list[str] | None = Field(
        default=None,
        description="Optional allowlist of workspace IDs to scan",
    )
    workspace_names: list[str] | None = Field(
        default=None,
        description="Optional allowlist of workspace names to scan",
    )
    include_personal_workspaces: bool | None = Field(
        default=False,
        description="Include personal workspaces when true",
    )
1636
+
1637
+
1638
class PowerBIOptionalExtraction(BaseModel):
    """
    Feature flags that control PowerBI entities to extract.
    """

    # Undeclared keys are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    extract_ownership: bool | None = Field(
        default=False,
        description="Extract workspace/report/dataset owner metadata",
    )

    # --- container grouping ---
    extract_workspaces_to_containers: bool | None = Field(
        default=True,
        description="Emit workspace metadata suitable for container grouping",
    )
    extract_datasets_to_containers: bool | None = Field(
        default=False,
        description="Emit dataset metadata suitable for container grouping",
    )

    # --- entity kinds ---
    extract_dashboards: bool | None = Field(
        default=True,
        description="Extract PowerBI dashboards",
    )
    extract_reports: bool | None = Field(
        default=True,
        description="Extract PowerBI reports",
    )
    extract_dataset_schema: bool | None = Field(
        default=True,
        description="Attempt to extract PowerBI dataset table schema metadata",
    )
1662
+
1663
+
1664
class PowerBIOptional(BaseModel):
    # Container for the optional PowerBI setting groups; each group may be
    # omitted (defaults to None) and undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    connection: PowerBIOptionalConnection | None = None
    scope: PowerBIOptionalScope | None = None
    extraction: PowerBIOptionalExtraction | None = None
1671
+
1672
+
1673
+ class TableauAuthMode(StrEnum):
1674
+ """
1675
+ Tableau authentication mode
1676
+ """
1677
+
1678
+ USERNAME_PASSWORD = 'USERNAME_PASSWORD'
1679
+ PERSONAL_ACCESS_TOKEN = 'PERSONAL_ACCESS_TOKEN'
1680
+
1681
+
1682
class TableauRequiredUsernamePassword(BaseModel):
    # Required Tableau settings for the USERNAME_PASSWORD auth mode;
    # undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    # Only the literal string 'USERNAME_PASSWORD' is accepted here.
    auth_mode: Literal["USERNAME_PASSWORD"]
    connect_uri: AnyUrl = Field(
        default=...,
        description="Tableau host URL (for example, https://dub01.online.tableau.com)",
    )
    # Required, but no min_length is set, so an empty string passes.
    site: str = Field(
        default=...,
        description="Tableau site content URL. Use empty string for the Default site on Tableau Server.",
    )
1695
+
1696
+
1697
class TableauRequiredPersonalAccessToken(BaseModel):
    # Required Tableau settings for the PERSONAL_ACCESS_TOKEN auth mode;
    # undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    # Only the literal string 'PERSONAL_ACCESS_TOKEN' is accepted here.
    auth_mode: Literal["PERSONAL_ACCESS_TOKEN"]
    connect_uri: AnyUrl = Field(
        default=...,
        description="Tableau host URL (for example, https://dub01.online.tableau.com)",
    )
    # Required, but no min_length is set, so an empty string passes.
    site: str = Field(
        default=...,
        description="Tableau site content URL. Use empty string for the Default site on Tableau Server.",
    )
    token_name: str = Field(
        default=...,
        description="Tableau personal access token name",
    )
1711
+
1712
+
1713
class TableauMaskedUsernamePassword(BaseModel):
    # Tableau login credentials; both fields are required and undeclared
    # keys are rejected.
    model_config = ConfigDict(extra="forbid")

    username: str = Field(default=..., description="Tableau login username")
    password: str = Field(default=..., description="Tableau login password")
1719
+
1720
+
1721
class TableauMaskedPersonalAccessToken(BaseModel):
    # Secret for token-based Tableau auth; the token value is required and
    # undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    token_value: str = Field(
        default=...,
        description="Tableau personal access token value",
    )
1726
+
1727
+
1728
class TableauOptionalConnection(BaseModel):
    """
    Tableau API connection and retry settings.
    """

    # Undeclared keys are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # Defaults to 3; accepted range is 0..10.
    max_retries: int | None = Field(
        default=3,
        description="Maximum retries for transient Tableau API request failures",
        ge=0,
        le=10,
    )
    # Accepts either a boolean switch or a string.
    ssl_verify: bool | str | None = Field(
        default=True,
        description="Verify SSL certificates. Provide a PEM bundle path string for custom certs.",
    )
    session_trust_env: bool | None = Field(
        default=False,
        description="When true, allow requests session proxy/environment settings",
    )
    # Defaults to 30; accepted range is 5..300.
    timeout_seconds: int | None = Field(
        default=30,
        description="HTTP timeout in seconds for Tableau requests",
        ge=5,
        le=300,
    )
1753
+
1754
+
1755
class TableauOptionalScope(BaseModel):
    """
    Optional Tableau project/workbook/datasource scope filters.
    """

    # Undeclared keys are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # --- name allowlists ---
    project_names: list[str] | None = Field(
        default=None,
        description="Optional Tableau project allowlist (exact names)",
    )
    workbook_names: list[str] | None = Field(
        default=None,
        description="Optional Tableau workbook allowlist (exact names)",
    )
    datasource_names: list[str] | None = Field(
        default=None,
        description="Optional Tableau datasource allowlist (exact names)",
    )

    # --- asset kind switches ---
    include_workbooks: bool | None = Field(
        default=True,
        description="Include workbook assets in extraction",
    )
    include_datasources: bool | None = Field(
        default=True,
        description="Include datasource assets in extraction",
    )
1778
+
1779
+
1780
class TableauOptionalExtraction(BaseModel):
    """
    Tableau metadata extraction feature flags.
    """

    # Undeclared keys are rejected during validation.
    model_config = ConfigDict(extra="forbid")

    # All three flags default to False.
    ingest_tags: bool | None = Field(
        default=False,
        description="Extract Tableau tags into asset metadata",
    )
    ingest_owner: bool | None = Field(
        default=False,
        description="Extract Tableau owner metadata into assets",
    )
    extract_usage_stats: bool | None = Field(
        default=False,
        description="Extract Tableau usage statistics when accessible",
    )
1797
+
1798
+
1799
class TableauOptional(BaseModel):
    # Container for the optional Tableau setting groups; each group may be
    # omitted (defaults to None) and undeclared keys are rejected.
    model_config = ConfigDict(extra="forbid")

    connection: TableauOptionalConnection | None = None
    scope: TableauOptionalScope | None = None
    extraction: TableauOptionalExtraction | None = None
1806
+
1807
+
1808
class CoreInput(BaseModel):
    # Base class of the per-source *Input models in this file. No
    # model_config is set on this class itself.
    type: AssetType

    # --- detector selection (both optional) ---
    detectors: list[Detector] | None = Field(
        default=None,
        description="Detectors to run on ingested content",
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description="Reusable custom detector IDs selected from the custom detector catalog.",
    )

    # sampling has no default and is therefore required.
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
1819
+
1820
+
1821
+ class Type(StrEnum):
1822
+ """
1823
+ Type of the asset or source
1824
+ """
1825
+
1826
+ WORDPRESS = 'WORDPRESS'
1827
+ SLACK = 'SLACK'
1828
+ S3_COMPATIBLE_STORAGE = 'S3_COMPATIBLE_STORAGE'
1829
+ AZURE_BLOB_STORAGE = 'AZURE_BLOB_STORAGE'
1830
+ GOOGLE_CLOUD_STORAGE = 'GOOGLE_CLOUD_STORAGE'
1831
+ POSTGRESQL = 'POSTGRESQL'
1832
+ MYSQL = 'MYSQL'
1833
+ MSSQL = 'MSSQL'
1834
+ ORACLE = 'ORACLE'
1835
+ HIVE = 'HIVE'
1836
+ DATABRICKS = 'DATABRICKS'
1837
+ SNOWFLAKE = 'SNOWFLAKE'
1838
+ MONGODB = 'MONGODB'
1839
+ NEO4J = 'NEO4J'
1840
+ POWERBI = 'POWERBI'
1841
+ TABLEAU = 'TABLEAU'
1842
+ CONFLUENCE = 'CONFLUENCE'
1843
+ JIRA = 'JIRA'
1844
+ SERVICEDESK = 'SERVICEDESK'
1845
+
1846
+
1847
class SlackInput(CoreInput):
    # Source input for Slack: narrows `type` to the literal 'SLACK' and adds
    # the required / masked / optional sections.
    type: Literal["SLACK"] = Field(
        default="SLACK",
        description="Type of the asset or source",
    )
    required: SlackRequired
    # Required; accepts any one of the three Slack masked variants.
    masked: SlackMaskedBotToken | SlackMaskedUserToken | SlackMaskedToken = Field(
        default=...,
        title="SlackMasked",
    )
    optional: SlackOptional | None = None

    # The remaining fields repeat the CoreInput declarations unchanged.
    detectors: list[Detector] | None = Field(
        default=None,
        description="Detectors to run on ingested content",
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description="Reusable custom detector IDs selected from the custom detector catalog.",
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
1863
+
1864
+
1865
class S3CompatibleStorageInput(CoreInput):
    # S3-compatible storage input: pins `type` to 'S3_COMPATIBLE_STORAGE'.
    type: Literal['S3_COMPATIBLE_STORAGE'] = Field(
        default='S3_COMPATIBLE_STORAGE',
        description='Type of the asset or source',
    )
    required: S3CompatibleStorageRequired
    masked: S3CompatibleStorageMasked | None = None  # may be omitted for this source
    optional: S3CompatibleStorageOptional | None = None
    # The remaining fields repeat the CoreInput declarations.
    detectors: list[Detector] | None = Field(
        default=None,
        description='Detectors to run on ingested content',
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description='Reusable custom detector IDs selected from the custom detector catalog.',
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
1881
+
1882
+
1883
class AzureBlobStorageInput(CoreInput):
    # Azure Blob Storage input: pins `type` to 'AZURE_BLOB_STORAGE'.
    type: Literal['AZURE_BLOB_STORAGE'] = Field(
        default='AZURE_BLOB_STORAGE',
        description='Type of the asset or source',
    )
    required: AzureBlobStorageRequired
    masked: AzureBlobStorageMasked | None = None  # may be omitted for this source
    optional: AzureBlobStorageOptional | None = None
    # The remaining fields repeat the CoreInput declarations.
    detectors: list[Detector] | None = Field(
        default=None,
        description='Detectors to run on ingested content',
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description='Reusable custom detector IDs selected from the custom detector catalog.',
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
1899
+
1900
+
1901
class GoogleCloudStorageInput(CoreInput):
    # Google Cloud Storage input: pins `type` to 'GOOGLE_CLOUD_STORAGE'.
    type: Literal['GOOGLE_CLOUD_STORAGE'] = Field(
        default='GOOGLE_CLOUD_STORAGE',
        description='Type of the asset or source',
    )
    required: GoogleCloudStorageRequired
    masked: GoogleCloudStorageMasked | None = None  # may be omitted for this source
    optional: GoogleCloudStorageOptional | None = None
    # The remaining fields repeat the CoreInput declarations.
    detectors: list[Detector] | None = Field(
        default=None,
        description='Detectors to run on ingested content',
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description='Reusable custom detector IDs selected from the custom detector catalog.',
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
1917
+
1918
+
1919
class WordPressInput(CoreInput):
    # WordPress source input: pins `type` to 'WORDPRESS'.
    type: Literal['WORDPRESS'] = Field(
        default='WORDPRESS',
        description='Type of the asset or source',
    )
    required: WordPressRequired
    masked: WordPressMasked
    optional: WordPressOptional | None = None
    # The remaining fields repeat the CoreInput declarations.
    detectors: list[Detector] | None = Field(
        default=None,
        description='Detectors to run on ingested content',
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description='Reusable custom detector IDs selected from the custom detector catalog.',
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
1935
+
1936
+
1937
class PostgreSQLInput(CoreInput):
    # PostgreSQL source input: pins `type` to 'POSTGRESQL'.
    type: Literal['POSTGRESQL'] = Field(
        default='POSTGRESQL',
        description='Type of the asset or source',
    )
    required: PostgreSQLRequired
    masked: PostgreSQLMasked
    optional: PostgreSQLOptional | None = None
    # The remaining fields repeat the CoreInput declarations.
    detectors: list[Detector] | None = Field(
        default=None,
        description='Detectors to run on ingested content',
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description='Reusable custom detector IDs selected from the custom detector catalog.',
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
1953
+
1954
+
1955
class MySQLInput(CoreInput):
    # MySQL source input: pins `type` to 'MYSQL'.
    type: Literal['MYSQL'] = Field(
        default='MYSQL',
        description='Type of the asset or source',
    )
    required: MySQLRequired
    masked: MySQLMasked
    optional: MySQLOptional | None = None
    # The remaining fields repeat the CoreInput declarations.
    detectors: list[Detector] | None = Field(
        default=None,
        description='Detectors to run on ingested content',
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description='Reusable custom detector IDs selected from the custom detector catalog.',
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
1969
+
1970
+
1971
class MSSQLInput(CoreInput):
    # MSSQL source input: pins `type` to 'MSSQL'.
    type: Literal['MSSQL'] = Field(
        default='MSSQL',
        description='Type of the asset or source',
    )
    required: MSSQLRequired
    masked: MSSQLMasked
    optional: MSSQLOptional | None = None
    # The remaining fields repeat the CoreInput declarations.
    detectors: list[Detector] | None = Field(
        default=None,
        description='Detectors to run on ingested content',
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description='Reusable custom detector IDs selected from the custom detector catalog.',
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
1985
+
1986
+
1987
class OracleInput(CoreInput):
    # Oracle source input: pins `type` to 'ORACLE'.
    type: Literal['ORACLE'] = Field(
        default='ORACLE',
        description='Type of the asset or source',
    )
    required: OracleRequired
    masked: OracleMasked
    optional: OracleOptional | None = None
    # The remaining fields repeat the CoreInput declarations.
    detectors: list[Detector] | None = Field(
        default=None,
        description='Detectors to run on ingested content',
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description='Reusable custom detector IDs selected from the custom detector catalog.',
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
2001
+
2002
+
2003
class HiveInput(CoreInput):
    # Hive source input: pins `type` to 'HIVE'.
    type: Literal['HIVE'] = Field(
        default='HIVE',
        description='Type of the asset or source',
    )
    required: HiveRequired
    masked: HiveMasked
    optional: HiveOptional | None = None
    # The remaining fields repeat the CoreInput declarations.
    detectors: list[Detector] | None = Field(
        default=None,
        description='Detectors to run on ingested content',
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description='Reusable custom detector IDs selected from the custom detector catalog.',
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
2017
+
2018
+
2019
class DatabricksInput(CoreInput):
    # Databricks source input: pins `type` to 'DATABRICKS'. Both `required`
    # and `masked` are unions of a PAT variant and a service-principal variant.
    type: Literal['DATABRICKS'] = Field(
        default='DATABRICKS',
        description='Type of the asset or source',
    )
    required: DatabricksRequiredPat | DatabricksRequiredServicePrincipal = Field(
        default=..., title='DatabricksRequired'
    )
    masked: DatabricksMaskedPat | DatabricksMaskedServicePrincipal = Field(
        default=..., title='DatabricksMasked'
    )
    optional: DatabricksOptional | None = None
    # The remaining fields repeat the CoreInput declarations.
    detectors: list[Detector] | None = Field(
        default=None,
        description='Detectors to run on ingested content',
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description='Reusable custom detector IDs selected from the custom detector catalog.',
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
2039
+
2040
+
2041
class SnowflakeInput(CoreInput):
    # Snowflake source input: pins `type` to 'SNOWFLAKE'. `required` and
    # `masked` each accept one of four authenticator-specific variants.
    type: Literal['SNOWFLAKE'] = Field(
        default='SNOWFLAKE',
        description='Type of the asset or source',
    )
    required: (
        SnowflakeRequiredDefaultAuthenticator
        | SnowflakeRequiredExternalBrowserAuthenticator
        | SnowflakeRequiredKeyPairAuthenticator
        | SnowflakeRequiredOauthAuthenticatorToken
    ) = Field(default=..., title='SnowflakeRequired')
    masked: (
        SnowflakeMaskedDefaultAuthenticator
        | SnowflakeMaskedExternalBrowserAuthenticator
        | SnowflakeMaskedKeyPairAuthenticator
        | SnowflakeMaskedOauthAuthenticatorToken
    ) = Field(default=..., title='SnowflakeMasked')
    optional: SnowflakeOptional | None = None
    # The remaining fields repeat the CoreInput declarations.
    detectors: list[Detector] | None = Field(
        default=None,
        description='Detectors to run on ingested content',
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description='Reusable custom detector IDs selected from the custom detector catalog.',
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
2067
+
2068
+
2069
class MongoDBInput(CoreInput):
    # MongoDB source input: pins `type` to 'MONGODB'. `required` is either the
    # Atlas or the on-prem variant; `masked` is username/password or "none".
    type: Literal['MONGODB'] = Field(
        default='MONGODB',
        description='Type of the asset or source',
    )
    required: MongoDBRequiredAtlas | MongoDBRequiredOnPrem = Field(
        default=..., title='MongoDBRequired'
    )
    masked: MongoDBMaskedUsernamePassword | MongoDBMaskedNone = Field(
        default=..., title='MongoDBMasked'
    )
    optional: MongoDBOptional | None = None
    # The remaining fields repeat the CoreInput declarations.
    detectors: list[Detector] | None = Field(
        default=None,
        description='Detectors to run on ingested content',
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description='Reusable custom detector IDs selected from the custom detector catalog.',
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
2089
+
2090
+
2091
class Neo4jRequired(BaseModel):
    """
    Neo4j connection endpoint. Accepts bolt://, neo4j://, or neo4j+s:// URIs.
    """

    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    # `uri` is a plain str here; no scheme validation is declared on the field.
    uri: str = Field(
        default=...,
        description='Bolt or Neo4j URI (e.g. bolt://localhost:7687 or neo4j+s://abc123.databases.neo4j.io)',
    )
    database: str | None = Field(
        default=None,
        description='Target database name (defaults to "neo4j"). Multi-database requires Neo4j 4.0+.',
    )
2107
+
2108
+
2109
class Neo4jMaskedUsernamePassword(BaseModel):
    """
    Neo4j basic auth credentials.
    """

    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    # Both credentials are required strings.
    username: str = Field(
        default=..., description='Neo4j username (typically "neo4j")'
    )
    password: str = Field(default=..., description='Neo4j password')
2119
+
2120
+
2121
class Neo4jMaskedNone(BaseModel):
    """
    No authentication (local dev / anonymous access).
    """

    # No fields are declared and extras are forbidden, so only an empty
    # object validates against this model.
    model_config = ConfigDict(extra='forbid')
2129
+
2130
+
2131
+ class TrustStrategy(StrEnum):
2132
+ """
2133
+ Certificate trust strategy. TRUST_ALL_CERTIFICATES is useful for self-signed certs in dev.
2134
+ """
2135
+
2136
+ TRUST_ALL_CERTIFICATES = 'TRUST_ALL_CERTIFICATES'
2137
+ TRUST_SYSTEM_CA_SIGNED_CERTIFICATES = 'TRUST_SYSTEM_CA_SIGNED_CERTIFICATES'
2138
+
2139
+
2140
class Neo4jOptionalConnection(BaseModel):
    """
    Neo4j driver connection tuning options.
    """

    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    # Lower bounds: timeout >= 1000 ms, pool size >= 1.
    connection_timeout_ms: int | None = Field(
        default=30000,
        description='Driver connection timeout in milliseconds.',
        ge=1000,
    )
    max_connection_pool_size: int | None = Field(
        default=10,
        description='Maximum number of connections in the driver pool.',
        ge=1,
    )
    encrypted: bool | None = Field(
        default=None,
        description='Force encrypted connection (overrides URI scheme detection).',
    )
    trust_strategy: TrustStrategy | None = Field(
        default=None,
        description='Certificate trust strategy. TRUST_ALL_CERTIFICATES is useful for self-signed certs in dev.',
    )
2161
+
2162
+
2163
class Neo4jOptionalScope(BaseModel):
    """
    Controls which node labels and relationships are included.
    """

    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    include_labels: list[str] | None = Field(
        default=None,
        description='Allowlist of node labels to scan. If empty, all labels are included.',
    )
    exclude_labels: list[str] | None = Field(
        default=None,
        description='Denylist of node labels to skip (case-sensitive).',
    )
    # NOTE(review): the field name reads "per label" while the description
    # speaks of assets per extraction run — confirm which one is intended.
    node_limit_per_label: int | None = Field(
        default=None,
        description='Maximum number of assets (node labels) to emit per extraction run.',
        ge=1,
    )
    include_relationships: bool | None = Field(
        default=True,
        description='When true, relationship edges between labels are resolved and stored as asset links.',
    )
2187
+
2188
+
2189
class Neo4jOptional(BaseModel):
    # Container for the optional Neo4j settings; each section may be omitted.
    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    connection: Neo4jOptionalConnection | None = None
    scope: Neo4jOptionalScope | None = None
2195
+
2196
+
2197
class Neo4jInput(CoreInput):
    # Neo4j source input: pins `type` to 'NEO4J'. `masked` is either basic-auth
    # credentials or the empty "no authentication" model.
    type: Literal['NEO4J'] = Field(
        default='NEO4J',
        description='Type of the asset or source',
    )
    required: Neo4jRequired
    masked: Neo4jMaskedUsernamePassword | Neo4jMaskedNone = Field(
        default=..., title='Neo4jMasked'
    )
    optional: Neo4jOptional | None = None
    # The remaining fields repeat the CoreInput declarations.
    detectors: list[Detector] | None = Field(
        default=None,
        description='Detectors to run on ingested content',
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description='Reusable custom detector IDs selected from the custom detector catalog.',
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
2213
+
2214
+
2215
class PowerBIInput(CoreInput):
    # Power BI source input: pins `type` to 'POWERBI'. `required` and `masked`
    # each accept a service-principal/client-secret or an access-token variant.
    type: Literal['POWERBI'] = Field(
        default='POWERBI',
        description='Type of the asset or source',
    )
    required: PowerBIRequiredServicePrincipal | PowerBIRequiredAccessToken = Field(
        default=..., title='PowerBIRequired'
    )
    masked: PowerBIMaskedClientSecret | PowerBIMaskedAccessToken = Field(
        default=..., title='PowerBIMasked'
    )
    optional: PowerBIOptional | None = None
    # The remaining fields repeat the CoreInput declarations.
    detectors: list[Detector] | None = Field(
        default=None,
        description='Detectors to run on ingested content',
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description='Reusable custom detector IDs selected from the custom detector catalog.',
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
2235
+
2236
+
2237
class TableauInput(CoreInput):
    # Tableau source input: pins `type` to 'TABLEAU'. `required` and `masked`
    # each accept a username/password or a personal-access-token variant.
    type: Literal['TABLEAU'] = Field(
        default='TABLEAU',
        description='Type of the asset or source',
    )
    required: TableauRequiredUsernamePassword | TableauRequiredPersonalAccessToken = Field(
        default=..., title='TableauRequired'
    )
    masked: TableauMaskedUsernamePassword | TableauMaskedPersonalAccessToken = Field(
        default=..., title='TableauMasked'
    )
    optional: TableauOptional | None = None
    # The remaining fields repeat the CoreInput declarations.
    detectors: list[Detector] | None = Field(
        default=None,
        description='Detectors to run on ingested content',
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description='Reusable custom detector IDs selected from the custom detector catalog.',
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
2257
+
2258
+
2259
class ConfluenceRequired(BaseModel):
    # Mandatory Confluence connection details: tenant URL and account email.
    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    base_url: AnyUrl = Field(
        default=...,
        description='Confluence Cloud tenant URL (for example, https://your-domain.atlassian.net)',
    )
    account_email: EmailStr = Field(
        default=...,
        description='Atlassian account email used with API token for Basic authentication',
    )
2271
+
2272
+
2273
class ConfluenceMasked(BaseModel):
    # Holds the single required Confluence API token.
    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    api_token: str = Field(
        default=..., description='Atlassian API token for Confluence Cloud'
    )
2278
+
2279
+
2280
class ConfluenceOptionalConnection(BaseModel):
    """
    HTTP and retry settings for Confluence API calls.
    """

    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    # Bounds: timeout >= 1.0, delay >= 0.0, retries within 0..10.
    request_timeout_seconds: float | None = Field(
        default=30,
        description='HTTP request timeout for Confluence API calls',
        ge=1.0,
    )
    rate_limit_delay_seconds: float | None = Field(
        default=0,
        description='Additional delay between API requests to reduce rate-limit pressure',
        ge=0.0,
    )
    max_retries: int | None = Field(
        default=3,
        description='Maximum retry attempts for transient API failures and rate limits',
        ge=0,
        le=10,
    )
2302
+
2303
+
2304
+ class Type16(StrEnum):
2305
+ """
2306
+ Filter spaces by space type
2307
+ """
2308
+
2309
+ global_ = 'global'
2310
+ collaboration = 'collaboration'
2311
+ knowledge_base = 'knowledge_base'
2312
+ personal = 'personal'
2313
+ system = 'system'
2314
+ onboarding = 'onboarding'
2315
+ xflow_sample_space = 'xflow_sample_space'
2316
+
2317
+
2318
+ class Status(StrEnum):
2319
+ """
2320
+ Filter spaces by status
2321
+ """
2322
+
2323
+ current = 'current'
2324
+ archived = 'archived'
2325
+
2326
+
2327
class ConfluenceOptionalScopeSpaces(BaseModel):
    """
    Space-level filters passed to Confluence /spaces endpoint.
    """

    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    # Each list filter is capped at 250 entries via max_length.
    ids: list[int] | None = Field(
        default=None,
        description='Filter spaces by IDs (up to 250)',
        max_length=250,
    )
    keys: list[str] | None = Field(
        default=None,
        description='Filter spaces by keys (up to 250)',
        max_length=250,
    )
    type: Type16 | None = Field(
        default=None, description='Filter spaces by space type'
    )
    status: Status | None = Field(
        default=None, description='Filter spaces by status'
    )
    labels: list[str] | None = Field(
        default=None,
        description='Filter spaces by labels (comma-separated in API request)',
        max_length=250,
    )
2348
+
2349
+
2350
class ConfluenceOptionalScope(BaseModel):
    # Wraps the optional space-level filter section.
    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    spaces: ConfluenceOptionalScopeSpaces | None = None
2355
+
2356
+
2357
class ConfluenceOptionalContent(BaseModel):
    """
    Confluence content extraction controls.
    """

    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    # All include_* switches default to True.
    include_footer_comments: bool | None = Field(
        default=True,
        description='Include footer comments and aggregate them into a per-page comments asset',
    )
    include_inline_comments: bool | None = Field(
        default=True,
        description='Include inline comments and aggregate them into a per-page comments asset',
    )
    include_attachments: bool | None = Field(
        default=True,
        description='Include Confluence page attachments as related assets',
    )
    include_linked_file_assets: bool | None = Field(
        default=True,
        description='Materialize linked file-like URLs from page body as related assets',
    )
    # 5242880 == 5 * 1024 * 1024; the lower bound is 1024.
    attachment_max_bytes: int | None = Field(
        default=5242880,
        description='Maximum bytes downloaded per attachment for MIME inference and text extraction',
        ge=1024,
    )
2385
+
2386
+
2387
class ConfluenceOptional(BaseModel):
    # Container for the optional Confluence settings; each section may be omitted.
    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    connection: ConfluenceOptionalConnection | None = None
    scope: ConfluenceOptionalScope | None = None
    content: ConfluenceOptionalContent | None = None
2394
+
2395
+
2396
class JiraRequired(BaseModel):
    # Mandatory Jira connection details: tenant URL and account email.
    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    base_url: AnyUrl = Field(
        default=...,
        description='Jira Cloud tenant URL (for example, https://your-domain.atlassian.net)',
    )
    account_email: EmailStr = Field(
        default=...,
        description='Atlassian account email used with API token for Basic authentication',
    )
2408
+
2409
+
2410
class JiraMasked(BaseModel):
    # Holds the single required Jira API token.
    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    api_token: str = Field(
        default=..., description='Atlassian API token for Jira Cloud'
    )
2415
+
2416
+
2417
class JiraOptionalConnection(BaseModel):
    """
    HTTP and retry settings for Jira API calls.
    """

    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    # Bounds: timeout >= 1.0, delay >= 0.0, retries within 0..10.
    request_timeout_seconds: float | None = Field(
        default=30,
        description='HTTP request timeout for Jira API calls',
        ge=1.0,
    )
    rate_limit_delay_seconds: float | None = Field(
        default=0,
        description='Additional delay between API requests to reduce rate-limit pressure',
        ge=0.0,
    )
    max_retries: int | None = Field(
        default=3,
        description='Maximum retry attempts for transient API failures and rate limits',
        ge=0,
        le=10,
    )
2439
+
2440
+
2441
class JiraOptionalScope(BaseModel):
    """
    Optional Jira scope filters. When omitted, all visible issues are eligible for sampling.
    """

    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    # When supplied, the project lists must hold between 1 and 50 entries.
    project_keys: list[str] | None = Field(
        default=None,
        description='Project keys to include (up to 50)',
        max_length=50,
        min_length=1,
    )
    project_ids: list[int] | None = Field(
        default=None,
        description='Project IDs to include (up to 50)',
        max_length=50,
        min_length=1,
    )
    # A supplied JQL string must be non-empty (min_length=1).
    jql: str | None = Field(
        default=None,
        description='Additional JQL filter to combine with project scope',
        min_length=1,
    )
2466
+
2467
+
2468
class JiraOptionalContent(BaseModel):
    """
    Jira issue content extraction controls.
    """

    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    # Both include_* switches default to True.
    include_comments: bool | None = Field(
        default=True,
        description='Include issue comments and aggregate them into a per-issue comments asset',
    )
    include_attachments: bool | None = Field(
        default=True,
        description='Include issue attachments as related assets',
    )
    # 5242880 == 5 * 1024 * 1024; the lower bound is 1024.
    attachment_max_bytes: int | None = Field(
        default=5242880,
        description='Maximum bytes downloaded per attachment for MIME inference and text extraction',
        ge=1024,
    )
2488
+
2489
+
2490
class JiraOptional(BaseModel):
    # Container for the optional Jira settings; each section may be omitted.
    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    connection: JiraOptionalConnection | None = None
    scope: JiraOptionalScope | None = None
    content: JiraOptionalContent | None = None
2497
+
2498
+
2499
class ServiceDeskRequired(BaseModel):
    # Mandatory Service Desk connection details: tenant URL and account email.
    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    base_url: AnyUrl = Field(
        default=...,
        description='Jira Service Management tenant URL (for example, https://your-domain.atlassian.net)',
    )
    account_email: EmailStr = Field(
        default=...,
        description='Atlassian account email used with API token for Basic authentication',
    )
2511
+
2512
+
2513
class ServiceDeskMasked(BaseModel):
    # Holds the single required Jira Service Management API token.
    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    api_token: str = Field(
        default=...,
        description='Atlassian API token for Jira Service Management Cloud',
    )
2520
+
2521
+
2522
class ServiceDeskOptionalConnection(BaseModel):
    """
    HTTP and retry settings for Jira Service Management API calls.
    """

    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    # Bounds: timeout >= 1.0, delay >= 0.0, retries within 0..10.
    request_timeout_seconds: float | None = Field(
        default=30,
        description='HTTP request timeout for Service Desk API calls',
        ge=1.0,
    )
    rate_limit_delay_seconds: float | None = Field(
        default=0,
        description='Additional delay between API requests to reduce rate-limit pressure',
        ge=0.0,
    )
    max_retries: int | None = Field(
        default=3,
        description='Maximum retry attempts for transient API failures and rate limits',
        ge=0,
        le=10,
    )
2544
+
2545
+
2546
class ServiceDeskOptionalScope(BaseModel):
    """
    Optional Service Desk scope filters. When omitted, all visible requests are eligible for sampling.
    """

    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    # When supplied, the ID lists must hold between 1 and 100 entries.
    service_desk_ids: list[int] | None = Field(
        default=None,
        description='Service desk IDs to include',
        max_length=100,
        min_length=1,
    )
    request_type_ids: list[int] | None = Field(
        default=None,
        description='Request type IDs to include',
        max_length=100,
        min_length=1,
    )
    request_status: str | None = Field(
        default=None,
        description='Request status filter passed to Service Desk API',
    )
    # Ownership values are free-form strings; only non-emptiness of the list
    # is enforced here.
    request_ownership: list[str] | None = Field(
        default=None,
        description='Ownership filter values passed to Service Desk API (for example, OWNED_REQUESTS)',
        min_length=1,
    )
    organization_id: int | None = Field(
        default=None, description='Organization ID used to scope requests'
    )
    search_term: str | None = Field(
        default=None,
        description='Search term filter for request summaries and content',
    )
2574
+
2575
+
2576
class ServiceDeskOptionalContent(BaseModel):
    """
    Service Desk request extraction controls.
    """

    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    # Both include_* switches default to True.
    include_comments: bool | None = Field(
        default=True,
        description='Include request comments and aggregate them into a per-request comments asset',
    )
    include_attachments: bool | None = Field(
        default=True,
        description='Include request attachments as related assets',
    )
    # 5242880 == 5 * 1024 * 1024; the lower bound is 1024.
    attachment_max_bytes: int | None = Field(
        default=5242880,
        description='Maximum bytes downloaded per attachment for MIME inference and text extraction',
        ge=1024,
    )
2596
+
2597
+
2598
class ServiceDeskOptional(BaseModel):
    # Container for the optional Service Desk settings; each section may be omitted.
    model_config = ConfigDict(extra='forbid')  # unknown keys are rejected

    connection: ServiceDeskOptionalConnection | None = None
    scope: ServiceDeskOptionalScope | None = None
    content: ServiceDeskOptionalContent | None = None
2605
+
2606
+
2607
+ class Type17(StrEnum):
2608
+ """
2609
+ Type of the asset or source
2610
+ """
2611
+
2612
+ WORDPRESS = 'WORDPRESS'
2613
+ SLACK = 'SLACK'
2614
+ S3_COMPATIBLE_STORAGE = 'S3_COMPATIBLE_STORAGE'
2615
+ AZURE_BLOB_STORAGE = 'AZURE_BLOB_STORAGE'
2616
+ GOOGLE_CLOUD_STORAGE = 'GOOGLE_CLOUD_STORAGE'
2617
+ POSTGRESQL = 'POSTGRESQL'
2618
+ MYSQL = 'MYSQL'
2619
+ MSSQL = 'MSSQL'
2620
+ ORACLE = 'ORACLE'
2621
+ HIVE = 'HIVE'
2622
+ DATABRICKS = 'DATABRICKS'
2623
+ SNOWFLAKE = 'SNOWFLAKE'
2624
+ MONGODB = 'MONGODB'
2625
+ NEO4J = 'NEO4J'
2626
+ POWERBI = 'POWERBI'
2627
+ TABLEAU = 'TABLEAU'
2628
+ CONFLUENCE = 'CONFLUENCE'
2629
+ JIRA = 'JIRA'
2630
+ SERVICEDESK = 'SERVICEDESK'
2631
+
2632
+
2633
class ConfluenceInput(CoreInput):
    # Confluence source input: pins `type` to 'CONFLUENCE'.
    type: Literal['CONFLUENCE'] = Field(
        default='CONFLUENCE',
        description='Type of the asset or source',
    )
    required: ConfluenceRequired
    masked: ConfluenceMasked
    optional: ConfluenceOptional | None = None
    # The remaining fields repeat the CoreInput declarations.
    detectors: list[Detector] | None = Field(
        default=None,
        description='Detectors to run on ingested content',
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description='Reusable custom detector IDs selected from the custom detector catalog.',
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
2649
+
2650
+
2651
class JiraInput(CoreInput):
    # Jira source input: pins `type` to 'JIRA'.
    type: Literal['JIRA'] = Field(
        default='JIRA',
        description='Type of the asset or source',
    )
    required: JiraRequired
    masked: JiraMasked
    optional: JiraOptional | None = None
    # The remaining fields repeat the CoreInput declarations.
    detectors: list[Detector] | None = Field(
        default=None,
        description='Detectors to run on ingested content',
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description='Reusable custom detector IDs selected from the custom detector catalog.',
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
2665
+
2666
+
2667
class ServiceDeskInput(CoreInput):
    # Service Desk source input: pins `type` to 'SERVICEDESK'.
    type: Literal['SERVICEDESK'] = Field(
        default='SERVICEDESK',
        description='Type of the asset or source',
    )
    required: ServiceDeskRequired
    masked: ServiceDeskMasked
    optional: ServiceDeskOptional | None = None
    # The remaining fields repeat the CoreInput declarations.
    detectors: list[Detector] | None = Field(
        default=None,
        description='Detectors to run on ingested content',
    )
    custom_detectors: list[CustomDetectorSelection] | None = Field(
        default=None,
        description='Reusable custom detector IDs selected from the custom detector catalog.',
    )
    sampling: SamplingConfig
    resources: ResourceOverrides | None = None
2683
+
2684
+
2685
class SourceInput(
    RootModel[
        SlackInput
        | S3CompatibleStorageInput
        | AzureBlobStorageInput
        | GoogleCloudStorageInput
        | PostgreSQLInput
        | MySQLInput
        | MSSQLInput
        | OracleInput
        | HiveInput
        | DatabricksInput
        | SnowflakeInput
        | MongoDBInput
        | Neo4jInput
        | PowerBIInput
        | TableauInput
        | WordPressInput
        | ConfluenceInput
        | JiraInput
        | ServiceDeskInput
    ]
):
    # Root model wrapping exactly one of the per-source *Input models.
    # The `root` annotation repeats the RootModel type parameter verbatim,
    # and no discriminator is declared on the Field.
    root: (
        SlackInput
        | S3CompatibleStorageInput
        | AzureBlobStorageInput
        | GoogleCloudStorageInput
        | PostgreSQLInput
        | MySQLInput
        | MSSQLInput
        | OracleInput
        | HiveInput
        | DatabricksInput
        | SnowflakeInput
        | MongoDBInput
        | Neo4jInput
        | PowerBIInput
        | TableauInput
        | WordPressInput
        | ConfluenceInput
        | JiraInput
        | ServiceDeskInput
    ) = Field(
        default=...,
        description='Merged configuration schema with all source types and common definitions',
        title='SourceInput',
    )