classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
|
@@ -0,0 +1,590 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
3
|
+
from collections.abc import AsyncGenerator, Generator
|
|
4
|
+
from datetime import UTC, datetime
|
|
5
|
+
from typing import Any
|
|
6
|
+
from urllib.parse import urlsplit
|
|
7
|
+
|
|
8
|
+
import requests
|
|
9
|
+
from bs4 import BeautifulSoup
|
|
10
|
+
|
|
11
|
+
from ...models.generated_input import SamplingStrategy, WordPressInput, WordPressOptionalContent
|
|
12
|
+
from ...models.generated_single_asset_scan_results import (
|
|
13
|
+
AssetType as OutputAssetType,
|
|
14
|
+
)
|
|
15
|
+
from ...models.generated_single_asset_scan_results import (
|
|
16
|
+
DetectionResult,
|
|
17
|
+
Location,
|
|
18
|
+
SingleAssetScanResults,
|
|
19
|
+
)
|
|
20
|
+
from ...utils.hashing import hash_url, normalize_http_url, unhash_id
|
|
21
|
+
from ..base import BaseSource
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
# Non-greedy matcher for one HTML tag; _strip_html() uses it to drop markup.
HTML_TAG_RE = re.compile(r"<.*?>")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class WordPressSource(BaseSource):
|
|
28
|
+
source_type = "wordpress"
|
|
29
|
+
|
|
30
|
+
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
):
    """Validate the recipe and prepare an HTTP session for the WordPress REST API.

    Args:
        recipe: Raw source recipe; validated with ``WordPressInput.model_validate``.
        source_id: Forwarded to the base class initializer.
        runner_id: Forwarded to the base class initializer; ``"local-run"`` is
            stored on the instance when it is empty or ``None``.
    """
    super().__init__(recipe, source_id=source_id, runner_id=runner_id)
    self.config = WordPressInput.model_validate(recipe)
    self.runner_id = runner_id if runner_id else "local-run"

    base = str(self.config.required.url).rstrip("/")
    self.site_base_url = base
    self.api_base = f"{base}/wp-json/wp/v2"

    # Lookup state filled during extraction; extract_raw() resets all three.
    self._url_to_wp_id: dict[str, str] = {}
    self._hash_to_url: dict[str, str] = {}
    self._seen_asset_hashes: set[str] = set()

    self.session = requests.Session()

    # Basic auth is only configured when both credential parts are present.
    credentials = self.config.masked
    if credentials.username and credentials.application_password:
        self.session.auth = (credentials.username, credentials.application_password)

    logger.info(f"Initialized WordPress source for {self.config.required.url}")
|
|
56
|
+
|
|
57
|
+
def _content_options(self) -> WordPressOptionalContent:
    """Return the recipe's optional content settings, or a default instance when unset."""
    optional = self.config.optional
    if not optional or not optional.content:
        return WordPressOptionalContent()
    return optional.content
|
|
61
|
+
|
|
62
|
+
def test_connection(self) -> dict[str, Any]:
    """Probe the WordPress REST API and report whether it is reachable.

    Issues a single ``GET {api_base}/posts`` request with ``per_page=1``.

    Returns:
        A dict with ``timestamp``, ``source_type``, ``status`` (``"SUCCESS"``
        or ``"FAILURE"``) and ``message``. HTTP 401/403 count as ``"SUCCESS"``
        because the API answered; any other non-200 status or a request
        error is a ``"FAILURE"``.
    """
    logger.info(f"Testing connection to WordPress at {self.config.required.url}...")

    result = {
        "timestamp": datetime.now(UTC).isoformat(),
        "source_type": self.recipe.get("type"),
    }

    try:
        response = self.session.get(
            f"{self.api_base}/posts", params={"per_page": 1}, timeout=10
        )
    except requests.exceptions.RequestException as e:
        result["status"] = "FAILURE"
        result["message"] = f"Failed to connect: {e!s}"
        logger.error(f"Connection test failed: {e}")
        return result

    code = response.status_code
    if code == 200:
        status = "SUCCESS"
        message = "Successfully connected to WordPress REST API."
        logger.info("Connection test successful")
    elif code in (401, 403):
        status = "SUCCESS"
        message = (
            "WordPress REST API is reachable, but authentication is required for "
            "private content."
        )
        logger.info("Connection test successful (authentication required)")
    else:
        status = "FAILURE"
        message = f"Unexpected status from WordPress REST API: {response.status_code}"
        logger.error(message)

    result["status"] = status
    result["message"] = message
    return result
|
|
98
|
+
|
|
99
|
+
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Yield batches of assets for WordPress posts, then pages.

    Resets the per-run lookup tables, streams every enabled content type via
    ``_stream_content_type`` and re-chunks the resulting assets into lists of
    ``self.BATCH_SIZE``; whatever remains is yielded as a final, smaller
    batch. Unless the sampling strategy is ``ALL``, at most ``rows_per_page``
    (default 100) items are extracted across both content types.
    """
    if self._aborted:
        return

    logger.info("Extracting metadata from WordPress...")

    self._url_to_wp_id = {}
    self._hash_to_url = {}
    self._seen_asset_hashes = set()

    buffered: list[SingleAssetScanResults] = []
    content_options = self._content_options()
    sampling = self.config.sampling
    limit: int | None
    if sampling.strategy == SamplingStrategy.ALL:
        limit = None
    else:
        limit = int(sampling.rows_per_page or 100)
    total_items_extracted = 0

    # (content type, enabled flag, whether an exhausted item budget skips it).
    # Only pages are gated on the budget; posts always run when enabled.
    plan = (
        ("posts", content_options.fetch_posts, False),
        ("pages", content_options.fetch_pages, True),
    )

    for content_type, enabled, budget_gated in plan:
        if enabled is False:
            continue
        if budget_gated and limit and total_items_extracted >= limit:
            continue

        type_items = 0
        type_assets = 0
        for assets_chunk, items_count in self._stream_content_type(
            content_type,
            limit - total_items_extracted if limit else None,
            sampling.strategy,
        ):
            type_items += items_count
            total_items_extracted += items_count
            type_assets += len(assets_chunk)

            buffered.extend(assets_chunk)
            while len(buffered) >= self.BATCH_SIZE:
                to_emit = buffered[: self.BATCH_SIZE]
                buffered = buffered[self.BATCH_SIZE :]
                if to_emit:
                    yield to_emit

        logger.info(f"Extracted {type_items} {content_type} into {type_assets} assets")

    if buffered:
        yield buffered

    logger.info("Total extracted WordPress items: %s", total_items_extracted)
|
|
170
|
+
|
|
171
|
+
def _stream_content_type(
    self,
    content_type: str,
    limit: int | None,
    strategy: SamplingStrategy = SamplingStrategy.LATEST,
) -> Generator[tuple[list[SingleAssetScanResults], int], None, None]:
    """Stream transformed assets for a content type while paginating the API.

    Args:
        content_type: REST collection name appended to ``api_base``
            (``"posts"`` or ``"pages"`` in this file).
        limit: Maximum number of items to transform; ``None`` (or 0) means
            no limit.
        strategy: Sampling strategy; ``LATEST`` requests items ordered by
            modification date, newest first.

    Yields:
        ``(assets, item_count)`` per fetched page: ``assets`` holds the page
        and image assets not seen before in this run, ``item_count`` the
        number of items transformed successfully. Pages where no item could
        be transformed are not yielded.

    Request failures are logged and end the stream; items that fail to
    transform are logged and skipped.
    """
    endpoint = f"{self.api_base}/{content_type}"
    items_extracted = 0
    page = 1
    # Keep the page size constant for the whole run. The API derives the
    # offset from page and per_page, so shrinking per_page on a later page
    # (as a "remaining items" size would) re-fetches items that earlier
    # pages already returned and counts those duplicates against the limit.
    per_page = min(100, limit) if limit else 100

    # Loop-invariant query parameters.
    extra_params: dict[str, Any] = {}
    # Only LATEST asks for an explicit ordering; every other strategy
    # (RANDOM included, which the API cannot express) uses the API default.
    if strategy == SamplingStrategy.LATEST:
        extra_params["orderby"] = "modified"
        extra_params["order"] = "desc"

    content_options = self._content_options()
    if content_options.post_status:
        extra_params["status"] = ",".join(content_options.post_status)

    while True:
        if self._aborted or (limit and items_extracted >= limit):
            break

        params: dict[str, Any] = {
            "per_page": per_page,
            "page": page,
            "_embed": "author,wp:term",
            **extra_params,
        }

        try:
            response = self.session.get(endpoint, params=params, timeout=30)
            response.raise_for_status()
            items = response.json()
        except requests.exceptions.RequestException as e:
            logger.error(f"Failed to fetch {content_type} page {page}: {e}")
            break

        if not items:
            break

        total_items = int(response.headers.get("X-WP-Total", 0))
        total_pages = int(response.headers.get("X-WP-TotalPages", 1))

        logger.info(
            f"Fetching {content_type} page {page}/{total_pages} "
            f"({len(items)} items, total: {total_items})"
        )

        page_assets: list[SingleAssetScanResults] = []
        page_items_extracted = 0
        for item in items:
            # The last page may hold more items than the limit still allows.
            if self._aborted or (limit and items_extracted >= limit):
                break

            try:
                page_asset, image_assets = self._transform_item_to_assets(
                    item, content_type
                )
                self._add_asset_if_new(page_assets, page_asset)
                for image_asset in image_assets:
                    self._add_asset_if_new(page_assets, image_asset)
                items_extracted += 1
                page_items_extracted += 1
            except Exception as e:
                # Best effort: one bad item must not abort the whole page.
                logger.error(
                    f"Failed to transform {content_type} item {item.get('id')}: {e}"
                )
                continue

        if page_items_extracted > 0:
            yield page_assets, page_items_extracted

        # A short page or the last advertised page ends the stream.
        if page >= total_pages or len(items) < per_page:
            break

        page += 1
|
|
255
|
+
|
|
256
|
+
def _fetch_content_type(
    self, content_type: str, limit: int | None
) -> tuple[list[SingleAssetScanResults], int]:
    """Collect the whole ``_stream_content_type`` stream into memory (test helper).

    Returns:
        ``(assets, item_count)`` accumulated over every yielded page, using
        the configured sampling strategy.
    """
    collected: list[SingleAssetScanResults] = []
    item_total = 0

    stream = self._stream_content_type(
        content_type, limit, self.config.sampling.strategy
    )
    for chunk, count in stream:
        collected += chunk
        item_total += count
    return collected, item_total
|
|
269
|
+
|
|
270
|
+
def _add_asset_if_new(
    self, results: list[SingleAssetScanResults], asset: SingleAssetScanResults
) -> None:
    """Append ``asset`` to ``results`` unless its hash was already recorded.

    New hashes are remembered in ``self._seen_asset_hashes`` so later calls
    skip duplicates.
    """
    seen = self._seen_asset_hashes
    key = asset.hash
    if key not in seen:
        seen.add(key)
        results.append(asset)
|
|
277
|
+
|
|
278
|
+
def _parse_wordpress_date(self, date_str: str | None) -> str | None:
|
|
279
|
+
"""Parse WordPress date and ensure it has timezone info."""
|
|
280
|
+
if not date_str:
|
|
281
|
+
return None
|
|
282
|
+
|
|
283
|
+
if "+" in date_str or date_str.endswith("Z"):
|
|
284
|
+
return date_str
|
|
285
|
+
|
|
286
|
+
return f"{date_str}+00:00"
|
|
287
|
+
|
|
288
|
+
def _parse_datetime(self, date_str: str | None) -> datetime:
|
|
289
|
+
if not date_str:
|
|
290
|
+
return datetime.now(UTC)
|
|
291
|
+
normalized = date_str.replace("Z", "+00:00")
|
|
292
|
+
try:
|
|
293
|
+
parsed = datetime.fromisoformat(normalized)
|
|
294
|
+
except ValueError:
|
|
295
|
+
return datetime.now(UTC)
|
|
296
|
+
if parsed.tzinfo is None:
|
|
297
|
+
return parsed.replace(tzinfo=UTC)
|
|
298
|
+
return parsed
|
|
299
|
+
|
|
300
|
+
def _transform_item(self, item: dict[str, Any], content_type: str) -> SingleAssetScanResults:
    """Transform a WordPress item into its page URL asset, discarding the image assets."""
    primary_asset, _images = self._transform_item_to_assets(item, content_type)
    return primary_asset
|
|
304
|
+
|
|
305
|
+
def _transform_item_to_assets(
    self, item: dict[str, Any], content_type: str
) -> tuple[SingleAssetScanResults, list[SingleAssetScanResults]]:
    """Build the page asset for an item plus one image asset per referenced image.

    Args:
        item: One element of a REST collection response.
        content_type: Collection name; its singular form is used in the
            fallback title.

    Returns:
        ``(page_asset, image_assets)``. The page asset links to the hashes of
        all images and anchors found in the rendered content.

    Raises:
        ValueError: Propagated from ``generate_hash_id`` when a URL cannot be
            normalized.
    """
    raw_id = item.get("id")
    wp_id = "" if raw_id is None else str(raw_id)
    slug = str(item.get("slug") or "")

    page_url = self._build_item_url(item, slug, wp_id)
    page_hash = self.generate_hash_id(page_url)

    if wp_id:
        # Indexed by hash and by URL so _resolve_wordpress_item_id() can
        # look the item up by either.
        self._url_to_wp_id.update({page_hash: wp_id, page_url: wp_id})

    raw_title = item.get("title", {})
    if isinstance(raw_title, dict):
        title_text = raw_title.get("rendered", "")
    else:
        title_text = str(raw_title)
    title = self._strip_html(title_text) or f"WordPress {content_type.rstrip('s')} {wp_id}"

    raw_excerpt = item.get("excerpt", {})
    if isinstance(raw_excerpt, dict):
        excerpt_html = raw_excerpt.get("rendered", "")
    else:
        excerpt_html = ""
    excerpt = self._strip_html(excerpt_html)[:200]

    raw_content = item.get("content", {})
    if isinstance(raw_content, dict):
        html_content = raw_content.get("rendered", "")
    else:
        html_content = ""

    image_urls, link_urls = self._extract_related_urls(html_content)
    image_hashes = list(map(self.generate_hash_id, image_urls))
    link_hashes = list(map(self.generate_hash_id, link_urls))
    page_links = self._unique_preserve_order(image_hashes + link_hashes)

    created_at_str = self._parse_wordpress_date(item.get("date_gmt", item.get("date")))
    updated_at_str = self._parse_wordpress_date(item.get("modified_gmt", item.get("modified")))
    created_dt = self._parse_datetime(created_at_str)
    updated_dt = self._parse_datetime(updated_at_str)

    # Input to the checksum: keys and their order are kept stable.
    metadata = {
        "wp_id": wp_id,
        "title": title,
        "slug": slug,
        "status": item.get("status"),
        "modified": updated_dt.isoformat(),
        "excerpt": excerpt[:100] if excerpt else None,
        "images_count": len(image_urls),
        "links_count": len(link_urls),
    }

    page_asset = SingleAssetScanResults(
        hash=page_hash,
        checksum=self.calculate_checksum(metadata),
        name=title,
        external_url=page_url,
        links=page_links,
        asset_type=OutputAssetType.URL,
        source_id=self.source_id,
        created_at=created_dt,
        updated_at=updated_dt,
        runner_id=self.runner_id,
    )

    image_assets: list[SingleAssetScanResults] = []
    for image_url, image_hash in zip(image_urls, image_hashes, strict=False):
        image_assets.append(
            self._make_image_asset(
                image_url=image_url,
                image_hash=image_hash,
                page_hash=page_hash,
                created_at=created_dt,
                updated_at=updated_dt,
            )
        )

    return page_asset, image_assets
|
|
376
|
+
|
|
377
|
+
def _build_item_url(self, item: dict[str, Any], slug: str, wp_id: str) -> str:
|
|
378
|
+
link = item.get("link")
|
|
379
|
+
if isinstance(link, str) and link.strip():
|
|
380
|
+
return link
|
|
381
|
+
|
|
382
|
+
if slug:
|
|
383
|
+
return f"{self.site_base_url}/{slug.lstrip('/')}"
|
|
384
|
+
|
|
385
|
+
if wp_id:
|
|
386
|
+
return f"{self.site_base_url}/?p={wp_id}"
|
|
387
|
+
|
|
388
|
+
return self.site_base_url
|
|
389
|
+
|
|
390
|
+
def _extract_related_urls(self, html_content: str) -> tuple[list[str], list[str]]:
|
|
391
|
+
if not html_content:
|
|
392
|
+
return [], []
|
|
393
|
+
|
|
394
|
+
soup = BeautifulSoup(html_content, "html.parser")
|
|
395
|
+
image_urls: list[str] = []
|
|
396
|
+
link_urls: list[str] = []
|
|
397
|
+
|
|
398
|
+
for image in soup.find_all("img"):
|
|
399
|
+
src = image.get("src")
|
|
400
|
+
if isinstance(src, str):
|
|
401
|
+
normalized = self._normalize_external_url(src)
|
|
402
|
+
if normalized:
|
|
403
|
+
image_urls.append(normalized)
|
|
404
|
+
|
|
405
|
+
for anchor in soup.find_all("a"):
|
|
406
|
+
href = anchor.get("href")
|
|
407
|
+
if isinstance(href, str):
|
|
408
|
+
normalized = self._normalize_external_url(href)
|
|
409
|
+
if normalized:
|
|
410
|
+
link_urls.append(normalized)
|
|
411
|
+
|
|
412
|
+
return (
|
|
413
|
+
self._unique_preserve_order(image_urls),
|
|
414
|
+
self._unique_preserve_order(link_urls),
|
|
415
|
+
)
|
|
416
|
+
|
|
417
|
+
def _normalize_external_url(self, raw_url: str) -> str | None:
    """Pass ``raw_url`` to ``normalize_http_url`` with the site base URL and return its result."""
    site = self.site_base_url
    return normalize_http_url(raw_url, base_url=site)
|
|
419
|
+
|
|
420
|
+
def _make_image_asset(
    self,
    *,
    image_url: str,
    image_hash: str,
    page_hash: str,
    created_at: datetime,
    updated_at: datetime,
) -> SingleAssetScanResults:
    """Build the IMAGE asset for an image referenced by a page.

    The checksum covers the image URL and the hash of the referencing page;
    timestamps are taken over from the caller.
    """
    checksum_payload = {
        "url": image_url,
        "referenced_by": page_hash,
    }
    display_name = self._image_name_from_url(image_url)

    return SingleAssetScanResults(
        hash=image_hash,
        checksum=self.calculate_checksum(checksum_payload),
        name=display_name,
        external_url=image_url,
        links=[],
        asset_type=OutputAssetType.IMAGE,
        source_id=self.source_id,
        created_at=created_at,
        updated_at=updated_at,
        runner_id=self.runner_id,
    )
|
|
447
|
+
|
|
448
|
+
def _image_name_from_url(self, image_url: str) -> str:
|
|
449
|
+
parsed = urlsplit(image_url)
|
|
450
|
+
file_name = parsed.path.rstrip("/").split("/")[-1]
|
|
451
|
+
return f"Image: {file_name}" if file_name else f"Image: {image_url}"
|
|
452
|
+
|
|
453
|
+
def _unique_preserve_order(self, values: list[str]) -> list[str]:
|
|
454
|
+
seen: set[str] = set()
|
|
455
|
+
unique_values: list[str] = []
|
|
456
|
+
for value in values:
|
|
457
|
+
if value in seen:
|
|
458
|
+
continue
|
|
459
|
+
seen.add(value)
|
|
460
|
+
unique_values.append(value)
|
|
461
|
+
return unique_values
|
|
462
|
+
|
|
463
|
+
def _strip_html(self, html: str) -> str:
    """Remove everything matched by ``HTML_TAG_RE`` and trim surrounding whitespace."""
    without_tags = HTML_TAG_RE.sub("", html)
    return without_tags.strip()
|
|
466
|
+
|
|
467
|
+
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """
    Fetch full content for a WordPress URL asset (for detector scanning).

    The REST API is tried first when the asset resolves to a WordPress item
    id; otherwise, or when that yields nothing, the asset's normalized URL is
    fetched directly.

    Returns:
        ``(html, text)`` on success, or ``None`` when no content was found or
        any error occurred (errors are logged, not raised).
    """
    try:
        from ...utils.content_extraction import html_to_text

        logger.info(f"Fetching content for WordPress asset {asset_id}")

        wp_id = self._resolve_wordpress_item_id(asset_id)
        html_content: str | None = (
            self._fetch_content_by_wp_id(wp_id) if wp_id else None
        )

        if not html_content:
            fallback_url = normalize_http_url(asset_id, base_url=self.site_base_url)
            if fallback_url:
                html_content = self._fetch_content_by_url(fallback_url)

        if html_content:
            text_content = html_to_text(html_content)
            logger.debug(
                f"Fetched {len(html_content)} bytes of HTML, "
                f"extracted {len(text_content)} bytes of text"
            )
            return html_content, text_content

        logger.warning(f"No content found for asset {asset_id}")
        return None

    except Exception as e:
        # Boundary for detector scanning: report the failure and carry on.
        logger.error(f"Failed to fetch content for WordPress asset {asset_id}: {e}")
        return None
|
|
501
|
+
|
|
502
|
+
def _resolve_wordpress_item_id(self, asset_id: str) -> str | None:
    """Map an asset id to a WordPress item id, if one can be determined.

    Looks the id up in ``self._url_to_wp_id`` (first in normalized form, then
    as given). Failing that, decodes it with ``unhash_id`` and returns the
    last all-digit segment of the ``_#_``-separated result. Returns ``None``
    when decoding fails or no such segment exists.
    """
    known = self._url_to_wp_id

    normalized = normalize_http_url(asset_id, base_url=self.site_base_url)
    if normalized and normalized in known:
        return known[normalized]
    if asset_id in known:
        return known[asset_id]

    try:
        decoded = unhash_id(asset_id)
    except Exception:
        return None

    segments = reversed(decoded.split("_#_"))
    return next((segment for segment in segments if segment.isdigit()), None)
|
|
519
|
+
|
|
520
|
+
def _fetch_content_by_wp_id(self, wp_id: str) -> str | None:
|
|
521
|
+
response = None
|
|
522
|
+
for endpoint in ["posts", "pages"]:
|
|
523
|
+
try:
|
|
524
|
+
url = f"{self.api_base}/{endpoint}/{wp_id}"
|
|
525
|
+
response = self.session.get(url, timeout=10)
|
|
526
|
+
response.raise_for_status()
|
|
527
|
+
break
|
|
528
|
+
except requests.exceptions.RequestException:
|
|
529
|
+
continue
|
|
530
|
+
|
|
531
|
+
if not response or not response.ok:
|
|
532
|
+
return None
|
|
533
|
+
|
|
534
|
+
data = response.json()
|
|
535
|
+
content_obj = data.get("content", {})
|
|
536
|
+
html_content = content_obj.get("rendered", "")
|
|
537
|
+
if not isinstance(html_content, str) or not html_content:
|
|
538
|
+
return None
|
|
539
|
+
|
|
540
|
+
return html_content
|
|
541
|
+
|
|
542
|
+
def _fetch_content_by_url(self, url: str) -> str | None:
|
|
543
|
+
try:
|
|
544
|
+
response = self.session.get(url, timeout=10)
|
|
545
|
+
response.raise_for_status()
|
|
546
|
+
except requests.exceptions.RequestException:
|
|
547
|
+
return None
|
|
548
|
+
|
|
549
|
+
content_type = response.headers.get("Content-Type", "").lower()
|
|
550
|
+
if content_type.startswith("image/"):
|
|
551
|
+
return None
|
|
552
|
+
if "html" in content_type or not content_type:
|
|
553
|
+
return response.text
|
|
554
|
+
return None
|
|
555
|
+
|
|
556
|
+
def generate_hash_id(self, asset_id: str) -> str:
    """Hash a URL-like asset id and remember the hash-to-URL mapping.

    The id is normalized against the site base URL, hashed with ``hash_url``
    and recorded in ``self._hash_to_url`` for later reverse lookup.

    Raises:
        ValueError: If ``normalize_http_url`` returns a falsy result.
    """
    site = self.site_base_url
    normalized = normalize_http_url(asset_id, base_url=site)
    if normalized:
        asset_hash = hash_url(normalized, base_url=site)
        self._hash_to_url[asset_hash] = normalized
        return asset_hash

    raise ValueError(f"Invalid URL for hash: {asset_id}")
|
|
565
|
+
|
|
566
|
+
def resolve_link_for_detection(self, link: str) -> str | None:
|
|
567
|
+
mapped = self._hash_to_url.get(link)
|
|
568
|
+
if mapped:
|
|
569
|
+
return mapped
|
|
570
|
+
return normalize_http_url(link)
|
|
571
|
+
|
|
572
|
+
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Set ``finding.location`` to a ``Location`` whose path is the asset's external URL.

    ``text_content`` is accepted but not used here.
    """
    location = Location(path=asset.external_url)
    finding.location = location
|
|
579
|
+
|
|
580
|
+
def abort(self) -> None:
    """Signal the source to stop extraction and close the HTTP session, if any."""
    logger.info("Aborting WordPress extraction...")
    super().abort()
    if not hasattr(self, "session"):
        return
    self.session.close()
|
|
586
|
+
|
|
587
|
+
def cleanup(self) -> None:
    """Close the HTTP session when the instance has one."""
    if not hasattr(self, "session"):
        return
    self.session.close()
|
src/telemetry.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""
|
|
2
|
+
OpenTelemetry initialisation for ephemeral Kubernetes CLI jobs.
|
|
3
|
+
|
|
4
|
+
Key design points:
|
|
5
|
+
- BatchSpanProcessor with 2 s delay so spans export before the job finishes.
|
|
6
|
+
- Explicit force_flush() + SIGTERM handler — Python's default SIGTERM kills
|
|
7
|
+
the process before atexit hooks run, which would silently drop spans.
|
|
8
|
+
- Graceful no-op when OTel packages are not installed or telemetry is disabled.
|
|
9
|
+
|
|
10
|
+
Opt-out: set TELEMETRY_DISABLED=1 or DO_NOT_TRACK=1 in the environment.
|
|
11
|
+
Install: uv sync --group otel
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import atexit
|
|
17
|
+
import os
|
|
18
|
+
import signal
|
|
19
|
+
import sys
|
|
20
|
+
from functools import partial
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def is_telemetry_enabled() -> bool:
    """Return ``False`` when any opt-out environment variable is set, else ``True``.

    Opt-outs: ``TELEMETRY_DISABLED=1``, ``DO_NOT_TRACK=1`` or
    ``CLASSIFYRE_TELEMETRY=false`` (case-insensitive).
    """
    opted_out = (
        os.getenv("TELEMETRY_DISABLED") == "1"
        or os.getenv("DO_NOT_TRACK") == "1"
        or os.getenv("CLASSIFYRE_TELEMETRY", "true").lower() == "false"
    )
    return not opted_out
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def init_telemetry() -> Any:
    """
    Initialise the OTel SDK and return the TracerProvider.

    Returns ``None`` when telemetry is disabled or the OTel SDK is
    not installed (the ``otel`` optional dependency group was not installed).

    Side effects when a provider is created: it is registered as the global
    tracer provider, an ``atexit`` hook flushes and shuts it down, and (when
    called from the main thread) a SIGTERM handler does the same before
    exiting with status 143.
    """
    if not is_telemetry_enabled():
        return None

    try:
        from opentelemetry import trace
        from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
            OTLPSpanExporter,
        )
        from opentelemetry.sdk.resources import Resource
        from opentelemetry.sdk.trace import TracerProvider
        from opentelemetry.sdk.trace.export import BatchSpanProcessor
    except ImportError:
        # OTel optional group not installed — run without telemetry.
        return None

    resource = Resource.create(
        {
            "service.name": os.getenv("OTEL_SERVICE_NAME", "classifyre-cli"),
            "service.version": os.getenv("SERVICE_VERSION", "0.0.0"),
            "deployment.environment.name": os.getenv(
                "DEPLOY_ENV", os.getenv("NODE_ENV", "production")
            ),
            "service.namespace": "classifyre",
            "service.instance.id": os.getenv("CLASSIFYRE_INSTANCE_ID", ""),
        }
    )

    provider = TracerProvider(resource=resource)
    provider.add_span_processor(
        BatchSpanProcessor(
            OTLPSpanExporter(),
            # Reduced delay so spans export before a short-lived job exits.
            schedule_delay_millis=2_000,
            max_queue_size=512,
            max_export_batch_size=256,
        )
    )
    trace.set_tracer_provider(provider)

    # Guards against a double shutdown: the SIGTERM handler exits through
    # sys.exit(), which runs the atexit hook as well.
    _shutdown_state: dict[str, bool] = {"called": False}

    def _shutdown(timeout_ms: int = 10_000) -> None:
        if _shutdown_state["called"]:
            return
        _shutdown_state["called"] = True
        provider.force_flush(timeout_millis=timeout_ms)
        provider.shutdown()

    def _handle_sigterm(signum: int, frame: Any) -> None:  # noqa: ARG001
        _shutdown(timeout_ms=8_000)
        sys.exit(143)

    atexit.register(_shutdown)
    try:
        signal.signal(signal.SIGTERM, _handle_sigterm)
    except ValueError:
        # signal.signal() only works in the main thread; elsewhere the
        # atexit hook remains the only flush path.
        pass

    return provider
|
src/utils/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Utility functions."""
|