classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,240 @@
1
+ # generated by datamodel-codegen:
2
+ # filename: single_asset_scan_results.json
3
+
4
+ from __future__ import annotations
5
+
6
+ from enum import StrEnum
7
+ from typing import Any
8
+
9
+ from pydantic import AwareDatetime, BaseModel, Field
10
+
11
+
12
+ class AssetType(StrEnum):
13
+ """
14
+ Canonical type of the asset payload
15
+ """
16
+
17
+ TXT = 'TXT'
18
+ TABLE = 'TABLE'
19
+ IMAGE = 'IMAGE'
20
+ VIDEO = 'VIDEO'
21
+ AUDIO = 'AUDIO'
22
+ URL = 'URL'
23
+ BINARY = 'BINARY'
24
+ OTHER = 'OTHER'
25
+
26
+
27
+ class DetectorType(StrEnum):
28
+ """
29
+ Type of detector for content analysis
30
+ """
31
+
32
+ SECRETS = 'SECRETS'
33
+ PII = 'PII'
34
+ YARA = 'YARA'
35
+ BROKEN_LINKS = 'BROKEN_LINKS'
36
+ CODE_SECURITY = 'CODE_SECURITY'
37
+ CUSTOM = 'CUSTOM'
38
+
39
+
40
+ class FindingCategory(StrEnum):
41
+ """
42
+ Normalized finding category for reporting and filtering
43
+ """
44
+
45
+ SECURITY = 'SECURITY'
46
+ PRIVACY = 'PRIVACY'
47
+ THREAT = 'THREAT'
48
+ CONTENT = 'CONTENT'
49
+ QUALITY = 'QUALITY'
50
+ FAIRNESS = 'FAIRNESS'
51
+ COMPLIANCE = 'COMPLIANCE'
52
+ SECRETS = 'SECRETS'
53
+ PII = 'PII'
54
+ CLASSIFICATION = 'CLASSIFICATION'
55
+
56
+
57
+ class Severity(StrEnum):
58
+ """
59
+ Severity level of finding
60
+ """
61
+
62
+ critical = 'critical'
63
+ high = 'high'
64
+ medium = 'medium'
65
+ low = 'low'
66
+ info = 'info'
67
+
68
+
69
class Location(BaseModel):
    """Location of finding in source content"""

    # Every field is optional and defaults to None.
    path: str | None = Field(
        default=None,
        description="Human-readable source reference: 'schema.table, row N' for tabular, URL for web/Slack",
        title="Path",
    )
    description: str | None = Field(
        default=None,
        description="Additional detail, e.g. column name where value was found",
        title="Description",
    )
    # line/column are documented as 1-indexed, start/end as 0-indexed offsets.
    line: int | None = Field(default=None, description="Line number (1-indexed)")
    column: int | None = Field(default=None, description="Column number (1-indexed)")
    start: int | None = Field(default=None, description="Start offset (0-indexed)")
    end: int | None = Field(default=None, description="End offset (0-indexed)")
88
+
89
+
90
class ScanStats(BaseModel):
    """Statistics about detector scan for an asset"""

    # Required fields (default is `...`).
    scanned_at: AwareDatetime = Field(
        default=...,
        description="Timestamp when the scan started",
    )
    duration_ms: int = Field(
        default=...,
        description="Duration of the scan in milliseconds",
    )
    detectors_run: list[DetectorType] = Field(
        default=...,
        description="List of detector types that were run",
    )

    # Optional fields; each defaults to None.
    content_size_bytes: int | None = Field(
        default=None,
        description="Size of the content that was scanned",
    )
    findings_count: int | None = Field(
        default=None,
        description="Total number of findings detected",
    )
    warnings: list[str] | None = Field(
        default=None,
        description="Non-fatal issues during scan (e.g. content truncation, empty content)",
    )
    errors: list[str] | None = Field(
        default=None,
        description="Detector errors during scan",
    )
113
+
114
+
115
class DetectionResult(BaseModel):
    """Result from detector scan"""

    # --- Required description of the finding -------------------------------
    detector_type: DetectorType = Field(
        default=...,
        description="Type of detector that found this",
        title="Detector Type",
    )
    finding_type: str = Field(
        default=...,
        description="Type of finding (e.g., 'aws_key', 'ssn', 'toxicity')",
        title="Finding Type",
    )
    # Accepts either a FindingCategory member or an arbitrary string.
    category: FindingCategory | str = Field(
        default=...,
        description="Category of finding (normalized category preferred, string allowed for compatibility)",
        title="Category",
    )
    # Required; declared without Field metadata.
    severity: Severity
    # Validation bounds the score to the closed interval [0.0, 1.0].
    confidence: float = Field(
        default=...,
        description="Confidence score (0-1)",
        ge=0.0,
        le=1.0,
        title="Confidence",
    )
    matched_content: str = Field(
        default=...,
        description="The content that matched",
        title="Matched Content",
    )

    # --- Optional context around the match; each defaults to None ----------
    redacted_content: str | None = Field(
        default=None,
        description="Redacted version of matched content",
        title="Redacted Content",
    )
    location: Location | None = Field(
        default=None,
        description="Location of finding in content",
    )
    context_before: str | None = Field(
        default=None,
        description="Text before the match",
        title="Context Before",
    )
    context_after: str | None = Field(
        default=None,
        description="Text after the match",
        title="Context After",
    )

    # --- Optional provenance ------------------------------------------------
    runner_id: str | None = Field(
        default=None,
        description="ID of the runner that detected this finding",
        title="Runner Id",
    )
    custom_detector_id: str | None = Field(
        default=None,
        description="Database ID of custom detector instance when detector_type is CUSTOM",
        title="Custom Detector Id",
    )
    custom_detector_key: str | None = Field(
        default=None,
        description="Stable key of custom detector instance when detector_type is CUSTOM",
        title="Custom Detector Key",
    )
    custom_detector_name: str | None = Field(
        default=None,
        description="Display name of custom detector instance when detector_type is CUSTOM",
        title="Custom Detector Name",
    )
    detected_at: AwareDatetime | None = Field(
        default=None,
        description="Timestamp when this finding was detected",
        title="Detected At",
    )

    # --- Optional free-form payloads ---------------------------------------
    metadata: dict[str, Any] | None = Field(
        default=None,
        description="Additional detector-specific metadata",
        title="Metadata",
    )
    extracted_data: dict[str, Any] | None = Field(
        default=None,
        description="Structured field values extracted from matched content",
        title="Extracted Data",
    )
    extraction_method: str | None = Field(
        default=None,
        description="Which extraction strategy was used: REGEX, GLINER, CLASSIFIER_GLINER",
        title="Extraction Method",
    )
192
+
193
+
194
class SingleAssetScanResults(BaseModel):
    """Single asset scan results with detector findings"""

    # --- Required asset identity -------------------------------------------
    hash: str = Field(
        default=...,
        description="Unique stable hash of the asset",
        title="Hash",
    )
    checksum: str = Field(
        default=...,
        description="SHA-256 checksum of the asset metadata to detect changes",
        title="Checksum",
    )
    name: str = Field(
        default=...,
        description="Name of the asset",
        title="Name",
    )
    external_url: str = Field(
        default=...,
        description="External URL of the asset",
        title="External Url",
    )
    links: list[str] = Field(
        default=...,
        description="Linked asset hashes referenced by this asset",
        title="Links",
    )
    asset_type: AssetType = Field(
        default=...,
        description="Canonical asset content type",
        title="Asset Type",
    )
    source_id: str | None = Field(
        default=None,
        description="ID of the source this asset belongs to (optional for local runs)",
        title="Source Id",
    )

    # --- Required timestamps ------------------------------------------------
    created_at: AwareDatetime = Field(
        default=...,
        description="The date and time the asset was created",
        title="Created At",
    )
    updated_at: AwareDatetime = Field(
        default=...,
        description="The date and time the asset was last updated",
        title="Updated At",
    )

    # --- Optional scan output; each defaults to None -----------------------
    runner_id: str | None = Field(
        default=None,
        description="ID of the runner that produced this asset (optional for local runs)",
        title="Runner Id",
    )
    findings: list[DetectionResult] | None = Field(
        default=None,
        description="Detector findings for this asset",
        title="Findings",
    )
    scan_stats: ScanStats | None = Field(
        default=None,
        description="Statistics about the detector scan for this asset",
        title="Scan Stats",
    )
@@ -0,0 +1,3 @@
1
+ from .factory import create_output_sink, resolve_output_settings
2
+
3
+ __all__ = ["create_output_sink", "resolve_output_settings"]
src/outputs/base.py ADDED
@@ -0,0 +1,69 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any, Literal, Protocol
5
+
6
+ from pydantic import BaseModel
7
+
8
+ OutputType = Literal["rest", "file", "console"]
9
+
10
+
11
@dataclass(frozen=True)
class OutputRuntimeContext:
    """Immutable per-run values handed to an output sink.

    Field order is part of the positional constructor signature.
    """

    # Identifier of the source being scanned, or None when none was given.
    source_id: str | None
    # Identifier of the runner, or None when none was given.
    runner_id: str | None
    # Whether the run was started with the managed-runner flag.
    managed_runner: bool
    # Batch size the sink exposes as its `batch_size` attribute.
    batch_size: int
17
+
18
+
19
@dataclass(frozen=True)
class OutputSettings:
    """Immutable, fully resolved output configuration.

    The first five fields are required; the remaining ones carry defaults.
    Field order is part of the positional constructor signature.
    """

    # One of "rest", "file" or "console".
    output_type: OutputType
    batch_size: int
    source_id: str | None
    runner_id: str | None
    managed_runner: bool
    # Target URL for the REST sink; None when not configured.
    rest_url: str | None = None
    # REST timeout in seconds.
    rest_timeout_sec: int = 30
    # Destination path for the file sink; None when not configured.
    file_path: str | None = None
29
+
30
+
31
class BatchEnvelope(BaseModel):
    # Envelope describing one emitted batch of assets.
    # (Kept as comments rather than a docstring so the model's generated
    # schema description is not altered.)

    # Discriminator; always the literal "batch".
    event: Literal["batch"] = "batch"
    output_type: OutputType
    source_id: str | None = None
    runner_id: str | None = None
    # Position of this batch within the run, as supplied by the sink.
    batch_index: int
    # Number of assets the sink reports for this batch.
    asset_count: int
    assets: list[dict[str, Any]]
39
+
40
+
41
class FinishEnvelope(BaseModel):
    # Envelope marking the end of a run, with its running totals.
    # (Kept as comments rather than a docstring so the model's generated
    # schema description is not altered.)

    # Discriminator; always the literal "finish".
    event: Literal["finish"] = "finish"
    output_type: OutputType
    source_id: str | None = None
    runner_id: str | None = None
    # Totals supplied by the sink when it finishes.
    batch_count: int
    total_assets: int
48
+
49
+
50
class ErrorEnvelope(BaseModel):
    # Envelope reporting a failed run.
    # (Kept as comments rather than a docstring so the model's generated
    # schema description is not altered.)

    # Discriminator; always the literal "error".
    event: Literal["error"] = "error"
    output_type: OutputType
    source_id: str | None = None
    runner_id: str | None = None
    # Error text supplied by the sink.
    error: str
56
+
57
+
58
class OutputSink(Protocol):
    """Structural interface implemented by every output sink.

    Any object exposing ``batch_size`` and the four coroutines below
    satisfies this protocol; no inheritance is required.
    """

    batch_size: int

    async def start(self) -> None:
        """Prepare the sink before any batch is emitted."""

    async def emit_batch(
        self, assets: list[dict[str, Any]], *, skip_findings: bool = False
    ) -> None:
        """Emit one batch of assets.

        ``skip_findings`` is keyword-only; its meaning is defined by the
        implementing sink.
        """

    async def finish(self) -> None:
        """Signal that the run completed."""

    async def fail(self, error: Exception) -> None:
        """Signal that the run failed with ``error``."""
src/outputs/console.py ADDED
@@ -0,0 +1,62 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from typing import Any
5
+
6
+ from .base import (
7
+ BatchEnvelope,
8
+ ErrorEnvelope,
9
+ FinishEnvelope,
10
+ OutputRuntimeContext,
11
+ OutputType,
12
+ )
13
+
14
+
15
class ConsoleOutputSink:
    """Output sink that prints each envelope as one JSON line.

    Batches, the final summary and errors are all written with ``print`` and
    flushed immediately. The sink keeps running totals of the batches and
    assets it has emitted so the finish envelope can report them.
    """

    output_type: OutputType = "console"

    def __init__(self, context: OutputRuntimeContext):
        self.context = context
        self.batch_size = context.batch_size
        self._batch_count = 0
        self._total_assets = 0

    async def start(self) -> None:
        """Do nothing; printing needs no setup."""
        return None

    async def emit_batch(
        self, assets: list[dict[str, Any]], *, skip_findings: bool = False
    ) -> None:
        """Print a batch envelope for ``assets``; empty batches are ignored.

        ``skip_findings`` is accepted for interface compatibility and is not
        used by this sink.
        """
        if not assets:
            return

        size = len(assets)
        # Counters are advanced before the envelope is built, so the batch
        # index is 1-based.
        self._batch_count += 1
        self._total_assets += size
        self._print_envelope(
            BatchEnvelope(
                batch_index=self._batch_count,
                asset_count=size,
                assets=assets,
                **self._identity(),
            )
        )

    async def finish(self) -> None:
        """Print the finish envelope carrying the running totals."""
        self._print_envelope(
            FinishEnvelope(
                batch_count=self._batch_count,
                total_assets=self._total_assets,
                **self._identity(),
            )
        )

    async def fail(self, error: Exception) -> None:
        """Print an error envelope containing ``str(error)``."""
        self._print_envelope(ErrorEnvelope(error=str(error), **self._identity()))

    def _identity(self) -> dict[str, Any]:
        """Return the envelope fields shared by every event of this sink."""
        return {
            "output_type": self.output_type,
            "source_id": self.context.source_id,
            "runner_id": self.context.runner_id,
        }

    @staticmethod
    def _print_envelope(payload: Any) -> None:
        """Serialise ``payload`` to JSON and print it on its own line."""
        print(json.dumps(payload.model_dump(mode="json")), flush=True)
src/outputs/factory.py ADDED
@@ -0,0 +1,156 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import os
5
+ from typing import Any, cast
6
+
7
+ from .base import OutputRuntimeContext, OutputSettings, OutputSink, OutputType
8
+ from .console import ConsoleOutputSink
9
+ from .file import FileOutputSink
10
+ from .rest import RestOutputSink
11
+
12
+
13
+ def _normalize_output_type(value: str) -> OutputType:
14
+ normalized = value.strip().lower()
15
+ if normalized not in {"rest", "file", "console"}:
16
+ raise ValueError("output type must be one of: rest, file, console")
17
+ return cast(OutputType, normalized)
18
+
19
+
20
+ def _parse_int(value: Any, fallback: int) -> int:
21
+ if value is None:
22
+ return fallback
23
+
24
+ if isinstance(value, bool):
25
+ return fallback
26
+
27
+ if isinstance(value, int):
28
+ return value
29
+
30
+ try:
31
+ return int(str(value).strip())
32
+ except (TypeError, ValueError):
33
+ return fallback
34
+
35
+
36
+ def _coalesce(*values: Any) -> Any:
37
+ for value in values:
38
+ if value is None:
39
+ continue
40
+ if isinstance(value, str) and not value.strip():
41
+ continue
42
+ return value
43
+ return None
44
+
45
+
46
def resolve_output_settings(
    args: argparse.Namespace,
) -> OutputSettings:
    """Combine CLI arguments, environment variables and defaults.

    For each setting the CLI attribute wins over the environment variable,
    which wins over the built-in default; ``None`` and blank strings count as
    "not provided". The REST timeout is read from the environment only.

    Args:
        args: Parsed command-line namespace; missing attributes are treated
            as not provided.

    Returns:
        The resolved, immutable output settings.

    Raises:
        ValueError: If the output type is unsupported, the batch size is
            below 1, the managed-runner flag is combined with a non-REST
            output, REST output lacks a source id (or a runner id when
            managed), or file output lacks a path.
    """
    env = os.environ

    def from_cli(name: str) -> Any:
        return getattr(args, name, None)

    def text_or_none(value: Any) -> str | None:
        return None if value is None else str(value)

    source_id = text_or_none(_coalesce(from_cli("source_id"), env.get("SOURCE_ID")))
    runner_id = text_or_none(_coalesce(from_cli("runner_id"), env.get("RUNNER_ID")))

    # With a source id the run defaults to REST output, otherwise to console.
    implied_type: OutputType = "rest" if source_id else "console"
    requested_type = _coalesce(
        from_cli("output_type"),
        env.get("CLASSIFYRE_OUTPUT_TYPE"),
        implied_type,
    )
    output_type = _normalize_output_type(str(requested_type))

    requested_batch_size = _coalesce(
        from_cli("output_batch_size"),
        env.get("CLASSIFYRE_OUTPUT_BATCH_SIZE"),
        20,
    )
    batch_size = _parse_int(requested_batch_size, fallback=20)
    if batch_size < 1:
        raise ValueError("output_batch_size must be >= 1")

    managed_runner = bool(getattr(args, "managed_runner", False))
    if managed_runner and output_type != "rest":
        raise ValueError("--managed-runner can only be used with output type 'rest'")

    rest_url = text_or_none(
        _coalesce(
            from_cli("output_rest_url"),
            env.get("CLASSIFYRE_OUTPUT_REST_URL"),
            env.get("API_URL"),
        )
    )

    # An unparsable or non-positive timeout silently falls back to 30 seconds.
    rest_timeout_sec = _parse_int(
        _coalesce(env.get("CLASSIFYRE_OUTPUT_REST_TIMEOUT_SEC"), 30), 30
    )
    if rest_timeout_sec < 1:
        rest_timeout_sec = 30

    file_path = text_or_none(
        _coalesce(
            from_cli("output_file_path"),
            env.get("CLASSIFYRE_OUTPUT_FILE_PATH"),
        )
    )

    if output_type == "file":
        if not file_path:
            raise ValueError(
                "file output requires output_file_path (--output-file-path or CLASSIFYRE_OUTPUT_FILE_PATH)"
            )
    elif output_type == "rest":
        if not source_id:
            raise ValueError("REST output requires source_id (--source-id or SOURCE_ID)")
        rest_url = rest_url or "http://localhost:8000"
        if managed_runner and not runner_id:
            raise ValueError("managed REST output requires runner_id")

    return OutputSettings(
        output_type=output_type,
        batch_size=batch_size,
        source_id=source_id,
        runner_id=runner_id,
        managed_runner=managed_runner,
        rest_url=rest_url,
        rest_timeout_sec=rest_timeout_sec,
        file_path=file_path,
    )
131
+
132
+
133
def create_output_sink(args: argparse.Namespace) -> OutputSink:
    """Build the output sink selected by ``args`` and the environment.

    Settings are resolved first; the sink class is then chosen from the
    resolved output type, with the console sink as the remaining case.

    Raises:
        ValueError: If settings resolution fails, or the REST URL / file path
            required by the chosen sink is missing.
    """
    settings = resolve_output_settings(args)
    context = OutputRuntimeContext(
        batch_size=settings.batch_size,
        managed_runner=settings.managed_runner,
        runner_id=settings.runner_id,
        source_id=settings.source_id,
    )

    kind = settings.output_type
    if kind == "rest":
        if settings.rest_url:
            return RestOutputSink(
                context,
                base_url=settings.rest_url,
                timeout_sec=settings.rest_timeout_sec,
            )
        raise ValueError("rest_url must be provided for REST output")

    if kind == "file":
        if settings.file_path:
            return FileOutputSink(context, file_path=settings.file_path)
        raise ValueError("file_path must be provided for file output")

    return ConsoleOutputSink(context)
src/outputs/file.py ADDED
@@ -0,0 +1,83 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Any, TextIO
6
+
7
+ from .base import (
8
+ BatchEnvelope,
9
+ ErrorEnvelope,
10
+ FinishEnvelope,
11
+ OutputRuntimeContext,
12
+ OutputType,
13
+ )
14
+
15
+
16
class FileOutputSink:
    """Output sink that appends one JSON envelope per line to a file.

    ``start`` opens the file in append mode (creating parent directories),
    ``emit_batch`` writes batch envelopes, and ``finish`` / ``fail`` write a
    closing envelope and release the file handle.
    """

    output_type: OutputType = "file"

    def __init__(self, context: OutputRuntimeContext, file_path: str):
        self.context = context
        self.batch_size = context.batch_size
        self.file_path = Path(file_path)
        self._batch_count = 0
        self._total_assets = 0
        self._handle: TextIO | None = None

    async def start(self) -> None:
        """Create the parent directory if needed and open the file for append."""
        self.file_path.parent.mkdir(parents=True, exist_ok=True)
        self._handle = self.file_path.open("a", encoding="utf-8")

    async def emit_batch(
        self, assets: list[dict[str, Any]], *, skip_findings: bool = False
    ) -> None:
        """Append a batch envelope for ``assets``; empty batches are ignored.

        ``skip_findings`` is accepted for interface compatibility and is not
        used by this sink.

        Raises:
            RuntimeError: If the sink has not been started.
        """
        if not assets:
            return
        handle = self._require_handle()
        self._batch_count += 1
        self._total_assets += len(assets)
        payload = BatchEnvelope(
            output_type=self.output_type,
            source_id=self.context.source_id,
            runner_id=self.context.runner_id,
            batch_index=self._batch_count,
            asset_count=len(assets),
            assets=assets,
        )
        self._write_line(handle, payload)

    async def finish(self) -> None:
        """Append the finish envelope and close the file.

        Raises:
            RuntimeError: If the sink has not been started.
        """
        handle = self._require_handle()
        try:
            payload = FinishEnvelope(
                output_type=self.output_type,
                source_id=self.context.source_id,
                runner_id=self.context.runner_id,
                batch_count=self._batch_count,
                total_assets=self._total_assets,
            )
            self._write_line(handle, payload)
        finally:
            # Release the file even when building or writing the envelope fails.
            self._close_handle()

    async def fail(self, error: Exception) -> None:
        """Append an error envelope, if the file is open, and close it.

        This runs on the error path, so it must not raise merely because the
        sink has no open file (``start`` never succeeded, or the sink was
        already finished): doing so would hide ``error`` behind an unrelated
        RuntimeError. In that situation the call is a no-op.
        """
        handle = self._handle
        if handle is None:
            return
        try:
            payload = ErrorEnvelope(
                output_type=self.output_type,
                source_id=self.context.source_id,
                runner_id=self.context.runner_id,
                error=str(error),
            )
            self._write_line(handle, payload)
        finally:
            # Release the file even when building or writing the envelope fails.
            self._close_handle()

    def _require_handle(self) -> TextIO:
        """Return the open file handle or raise if the sink is not started."""
        if self._handle is None:
            raise RuntimeError("File output sink was not started before attempting to emit.")
        return self._handle

    @staticmethod
    def _write_line(handle: TextIO, payload: Any) -> None:
        """Write ``payload`` as one JSON line and flush it."""
        handle.write(json.dumps(payload.model_dump(mode="json")))
        handle.write("\n")
        handle.flush()

    def _close_handle(self) -> None:
        """Close the file handle, if any, and mark the sink as not started."""
        handle, self._handle = self._handle, None
        if handle is not None:
            handle.close()