classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from ..models.generated_single_asset_scan_results import Location
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(frozen=True)
class TabularCellMatch:
    """Immutable record identifying one cell of a sampled tabular payload."""

    # Row number of the cell, as counted by the lookup that produced it.
    row_index: int
    # Name of the column holding the cell.
    column_name: str
    # Full row the cell belongs to, keyed by column name, values as strings.
    row: dict[str, str]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def format_tabular_sample_content(
|
|
19
|
+
*,
|
|
20
|
+
scope_label: str,
|
|
21
|
+
scope_value: str,
|
|
22
|
+
strategy: Any,
|
|
23
|
+
rows: list[tuple[Any, ...]],
|
|
24
|
+
column_names: list[str],
|
|
25
|
+
serialize_cell: Any,
|
|
26
|
+
include_column_names: bool,
|
|
27
|
+
object_type: str | None = None,
|
|
28
|
+
raw_metadata: dict[str, Any] | None = None,
|
|
29
|
+
row_offset: int = 0,
|
|
30
|
+
) -> tuple[str, str]:
|
|
31
|
+
lines = [
|
|
32
|
+
f"{scope_label}={scope_value}",
|
|
33
|
+
]
|
|
34
|
+
if object_type:
|
|
35
|
+
lines.append(f"object_type={object_type}")
|
|
36
|
+
lines.extend(
|
|
37
|
+
[
|
|
38
|
+
f"sampling_strategy={strategy}",
|
|
39
|
+
f"sampled_rows={len(rows)}",
|
|
40
|
+
"",
|
|
41
|
+
]
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
serialized_rows: list[dict[str, str]] = []
|
|
45
|
+
for index, row in enumerate(rows, start=1 + row_offset):
|
|
46
|
+
serialized_row: dict[str, str] = {}
|
|
47
|
+
lines.append(f"row_{index}:")
|
|
48
|
+
for column_name, cell in zip(column_names, row, strict=False):
|
|
49
|
+
serialized = str(serialize_cell(cell))
|
|
50
|
+
serialized_row[column_name] = serialized
|
|
51
|
+
if include_column_names:
|
|
52
|
+
rendered_lines = serialized.splitlines() or [""]
|
|
53
|
+
first_line, *continuation_lines = rendered_lines
|
|
54
|
+
lines.append(f" {column_name}: {first_line}")
|
|
55
|
+
for continuation_line in continuation_lines:
|
|
56
|
+
lines.append(f" {continuation_line}")
|
|
57
|
+
else:
|
|
58
|
+
lines.append(f" {serialized}")
|
|
59
|
+
lines.append("")
|
|
60
|
+
serialized_rows.append(serialized_row)
|
|
61
|
+
|
|
62
|
+
raw_payload = dict(raw_metadata or {})
|
|
63
|
+
raw_payload["strategy"] = str(strategy)
|
|
64
|
+
raw_payload["rows"] = serialized_rows
|
|
65
|
+
raw_payload["row_offset"] = row_offset
|
|
66
|
+
if object_type:
|
|
67
|
+
raw_payload["object_type"] = object_type
|
|
68
|
+
|
|
69
|
+
text_content = "\n".join(lines).rstrip()
|
|
70
|
+
return json.dumps(raw_payload, ensure_ascii=False), text_content
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def build_tabular_location(
    *,
    raw_content: str | None,
    matched_content: str,
    base_path: str,
    primary_key_columns: list[str] | None = None,
    row_index: int | None = None,
    column_name: str | None = None,
) -> Location:
    """Build a ``Location`` pointing at the cell that holds a match.

    The cell is looked up with ``_find_tabular_cell_match``. When no cell is
    found the location is just ``base_path``. Otherwise the path is
    ``base_path`` followed by ``", "`` and either the ``column=value`` pairs
    of the primary-key columns present in the matched row, or
    ``row <index>`` when none of them are present. The description names the
    matched column.

    Args:
        raw_content: JSON payload passed through to the cell lookup.
        matched_content: Text whose containing cell is being located.
        base_path: Path prefix identifying the scanned object.
        primary_key_columns: Columns used to identify the row, if any.
        row_index: Optional row number restricting the lookup.
        column_name: Optional column name restricting the lookup.

    Returns:
        The constructed ``Location``.
    """
    cell = _find_tabular_cell_match(
        raw_content,
        matched_content,
        row_index=row_index,
        column_name=column_name,
    )
    if cell is None:
        return Location(path=base_path)

    key_fragments = [
        f"{key}={cell.row[key]}"
        for key in (primary_key_columns or [])
        if key in cell.row
    ]
    # Fall back to the row number when no primary-key value is available.
    suffix = ", ".join(key_fragments) if key_fragments else f"row {cell.row_index}"
    return Location(
        path=base_path + f", {suffix}",
        description=f"column {cell.column_name}",
    )
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _find_tabular_cell_match(
    raw_content: str | None,
    matched_content: str,
    *,
    row_index: int | None = None,
    column_name: str | None = None,
) -> TabularCellMatch | None:
    """Find the cell of a JSON row payload that contains ``matched_content``.

    ``raw_content`` is expected to be a JSON object with a ``rows`` list of
    ``{column: value}`` objects and an optional integer ``row_offset``. Rows
    are numbered from ``1 + row_offset``. Cells are scanned in payload order
    and the result is chosen as follows:

    1. The first cell whose text equals ``matched_content``, either exactly
       or after whitespace normalization, is returned immediately.
    2. Otherwise the first cell containing ``matched_content`` verbatim.
    3. Otherwise the first cell containing it after whitespace
       normalization.

    Args:
        raw_content: JSON text of the payload; may be ``None`` or empty.
        matched_content: Text to look for.
        row_index: When given, only the row with this number is examined.
        column_name: When given, only this column is examined.

    Returns:
        The matching cell, or ``None`` when either input is empty, the
        payload is malformed, or nothing matches.
    """
    if not raw_content or not matched_content:
        return None

    try:
        payload = json.loads(raw_content)
    except (TypeError, json.JSONDecodeError):
        return None

    # Valid JSON need not be an object (e.g. "[1, 2]" or "42"); such a
    # payload has no rows to search and must not crash on ``.get``.
    if not isinstance(payload, dict):
        return None

    rows = payload.get("rows")
    if not isinstance(rows, list):
        return None

    # A non-integer offset would make the row numbering below raise
    # TypeError; treat it like any other malformed payload. ``bool`` is
    # rejected explicitly because it is a subclass of ``int``.
    row_offset = payload.get("row_offset", 0)
    if isinstance(row_offset, bool) or not isinstance(row_offset, int):
        return None

    normalized_match = _normalize_for_match(matched_content)
    substring_match: TabularCellMatch | None = None
    normalized_substring_match: TabularCellMatch | None = None
    for current_row_index, raw_row in enumerate(rows, start=1 + row_offset):
        if row_index is not None and current_row_index != row_index:
            continue
        if not isinstance(raw_row, dict):
            continue

        # Compare on strings only; a null cell is treated as empty text.
        row = {str(key): "" if value is None else str(value) for key, value in raw_row.items()}
        for current_column_name, value in row.items():
            if column_name is not None and current_column_name != column_name:
                continue
            if value == matched_content:
                return TabularCellMatch(
                    row_index=current_row_index,
                    column_name=current_column_name,
                    row=row,
                )
            # Remember only the first weaker candidate of each kind.
            if substring_match is None and matched_content in value:
                substring_match = TabularCellMatch(
                    row_index=current_row_index,
                    column_name=current_column_name,
                    row=row,
                )
            normalized_value = _normalize_for_match(value)
            if normalized_value == normalized_match:
                return TabularCellMatch(
                    row_index=current_row_index,
                    column_name=current_column_name,
                    row=row,
                )
            if normalized_substring_match is None and normalized_match in normalized_value:
                normalized_substring_match = TabularCellMatch(
                    row_index=current_row_index,
                    column_name=current_column_name,
                    row=row,
                )

    return substring_match or normalized_substring_match
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _normalize_for_match(value: str) -> str:
    """Collapse each whitespace run to one space and trim both ends."""
    whitespace_run = re.compile(r"\s+")
    collapsed = whitespace_run.sub(" ", value)
    return collapsed.strip()
|