classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,165 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import re
5
+ from dataclasses import dataclass
6
+ from typing import Any
7
+
8
+ from ..models.generated_single_asset_scan_results import Location
9
+
10
+
11
@dataclass(frozen=True)
class TabularCellMatch:
    """Immutable record of the sampled cell in which a match was found."""

    # Row number of the cell, as counted by the enumerating caller.
    row_index: int
    # Name of the column that holds the matching cell.
    column_name: str
    # Every cell of the matching row, keyed by column name, as strings.
    row: dict[str, str]
16
+
17
+
18
+ def format_tabular_sample_content(
19
+ *,
20
+ scope_label: str,
21
+ scope_value: str,
22
+ strategy: Any,
23
+ rows: list[tuple[Any, ...]],
24
+ column_names: list[str],
25
+ serialize_cell: Any,
26
+ include_column_names: bool,
27
+ object_type: str | None = None,
28
+ raw_metadata: dict[str, Any] | None = None,
29
+ row_offset: int = 0,
30
+ ) -> tuple[str, str]:
31
+ lines = [
32
+ f"{scope_label}={scope_value}",
33
+ ]
34
+ if object_type:
35
+ lines.append(f"object_type={object_type}")
36
+ lines.extend(
37
+ [
38
+ f"sampling_strategy={strategy}",
39
+ f"sampled_rows={len(rows)}",
40
+ "",
41
+ ]
42
+ )
43
+
44
+ serialized_rows: list[dict[str, str]] = []
45
+ for index, row in enumerate(rows, start=1 + row_offset):
46
+ serialized_row: dict[str, str] = {}
47
+ lines.append(f"row_{index}:")
48
+ for column_name, cell in zip(column_names, row, strict=False):
49
+ serialized = str(serialize_cell(cell))
50
+ serialized_row[column_name] = serialized
51
+ if include_column_names:
52
+ rendered_lines = serialized.splitlines() or [""]
53
+ first_line, *continuation_lines = rendered_lines
54
+ lines.append(f" {column_name}: {first_line}")
55
+ for continuation_line in continuation_lines:
56
+ lines.append(f" {continuation_line}")
57
+ else:
58
+ lines.append(f" {serialized}")
59
+ lines.append("")
60
+ serialized_rows.append(serialized_row)
61
+
62
+ raw_payload = dict(raw_metadata or {})
63
+ raw_payload["strategy"] = str(strategy)
64
+ raw_payload["rows"] = serialized_rows
65
+ raw_payload["row_offset"] = row_offset
66
+ if object_type:
67
+ raw_payload["object_type"] = object_type
68
+
69
+ text_content = "\n".join(lines).rstrip()
70
+ return json.dumps(raw_payload, ensure_ascii=False), text_content
71
+
72
+
73
def build_tabular_location(
    *,
    raw_content: str | None,
    matched_content: str,
    base_path: str,
    primary_key_columns: list[str] | None = None,
    row_index: int | None = None,
    column_name: str | None = None,
) -> Location:
    """Build a ``Location`` pointing at the cell that holds a match.

    The cell is looked up with ``_find_tabular_cell_match``. When no cell
    is found the location is just ``base_path``. Otherwise the path is
    extended with ``column=value`` pairs for every primary-key column
    present in the matching row, or with ``row <n>`` when none is
    available, and the description names the matching column.
    """
    hit = _find_tabular_cell_match(
        raw_content,
        matched_content,
        row_index=row_index,
        column_name=column_name,
    )
    if hit is None:
        return Location(path=base_path)

    key_fragments: list[str] = []
    for key_column in primary_key_columns or []:
        # Key columns missing from the sampled row are skipped silently.
        if key_column in hit.row:
            key_fragments.append(f"{key_column}={hit.row[key_column]}")

    if key_fragments:
        suffix = ", ".join(key_fragments)
    else:
        suffix = f"row {hit.row_index}"

    return Location(
        path=f"{base_path}, {suffix}",
        description=f"column {hit.column_name}",
    )
100
+
101
+
102
+ def _find_tabular_cell_match(
103
+ raw_content: str | None,
104
+ matched_content: str,
105
+ *,
106
+ row_index: int | None = None,
107
+ column_name: str | None = None,
108
+ ) -> TabularCellMatch | None:
109
+ if not raw_content or not matched_content:
110
+ return None
111
+
112
+ try:
113
+ payload = json.loads(raw_content)
114
+ except (TypeError, json.JSONDecodeError):
115
+ return None
116
+
117
+ rows = payload.get("rows")
118
+ if not isinstance(rows, list):
119
+ return None
120
+
121
+ row_offset = payload.get("row_offset", 0)
122
+ normalized_match = _normalize_for_match(matched_content)
123
+ substring_match: TabularCellMatch | None = None
124
+ normalized_substring_match: TabularCellMatch | None = None
125
+ for current_row_index, raw_row in enumerate(rows, start=1 + row_offset):
126
+ if row_index is not None and current_row_index != row_index:
127
+ continue
128
+ if not isinstance(raw_row, dict):
129
+ continue
130
+
131
+ row = {str(key): "" if value is None else str(value) for key, value in raw_row.items()}
132
+ for current_column_name, value in row.items():
133
+ if column_name is not None and current_column_name != column_name:
134
+ continue
135
+ if value == matched_content:
136
+ return TabularCellMatch(
137
+ row_index=current_row_index,
138
+ column_name=current_column_name,
139
+ row=row,
140
+ )
141
+ if substring_match is None and matched_content in value:
142
+ substring_match = TabularCellMatch(
143
+ row_index=current_row_index,
144
+ column_name=current_column_name,
145
+ row=row,
146
+ )
147
+ normalized_value = _normalize_for_match(value)
148
+ if normalized_value == normalized_match:
149
+ return TabularCellMatch(
150
+ row_index=current_row_index,
151
+ column_name=current_column_name,
152
+ row=row,
153
+ )
154
+ if normalized_substring_match is None and normalized_match in normalized_value:
155
+ normalized_substring_match = TabularCellMatch(
156
+ row_index=current_row_index,
157
+ column_name=current_column_name,
158
+ row=row,
159
+ )
160
+
161
+ return substring_match or normalized_substring_match
162
+
163
+
164
+ def _normalize_for_match(value: str) -> str:
165
+ return re.sub(r"\s+", " ", value).strip()
@@ -0,0 +1,3 @@
1
+ from .source import WordPressSource
2
+
3
+ __all__ = ["WordPressSource"]