classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,590 @@
1
+ import logging
2
+ import re
3
+ from collections.abc import AsyncGenerator, Generator
4
+ from datetime import UTC, datetime
5
+ from typing import Any
6
+ from urllib.parse import urlsplit
7
+
8
+ import requests
9
+ from bs4 import BeautifulSoup
10
+
11
+ from ...models.generated_input import SamplingStrategy, WordPressInput, WordPressOptionalContent
12
+ from ...models.generated_single_asset_scan_results import (
13
+ AssetType as OutputAssetType,
14
+ )
15
+ from ...models.generated_single_asset_scan_results import (
16
+ DetectionResult,
17
+ Location,
18
+ SingleAssetScanResults,
19
+ )
20
+ from ...utils.hashing import hash_url, normalize_http_url, unhash_id
21
+ from ..base import BaseSource
22
+
23
+ logger = logging.getLogger(__name__)
24
+ HTML_TAG_RE = re.compile("<.*?>")
25
+
26
+
27
+ class WordPressSource(BaseSource):
28
+ source_type = "wordpress"
29
+
30
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
):
    """Validate the recipe and prepare an HTTP session for the WordPress REST API.

    Args:
        recipe: Raw source recipe; validated against ``WordPressInput``.
        source_id: Identifier forwarded to the base source.
        runner_id: Identifier of the current run; ``"local-run"`` when omitted.
    """
    super().__init__(recipe, source_id=source_id, runner_id=runner_id)
    self.config = WordPressInput.model_validate(recipe)
    self.runner_id = runner_id if runner_id else "local-run"

    base_url = str(self.config.required.url).rstrip("/")
    self.site_base_url = base_url
    self.api_base = f"{base_url}/wp-json/wp/v2"

    # Lookup tables populated while extracting; reset on every extract_raw run.
    self._url_to_wp_id: dict[str, str] = {}
    self._hash_to_url: dict[str, str] = {}
    self._seen_asset_hashes: set[str] = set()

    self.session = requests.Session()

    # Basic auth is only attached when both credential parts are configured.
    if self.config.masked.username and self.config.masked.application_password:
        credentials = (
            self.config.masked.username,
            self.config.masked.application_password,
        )
        self.session.auth = credentials

    logger.info(f"Initialized WordPress source for {self.config.required.url}")
56
+
57
def _content_options(self) -> WordPressOptionalContent:
    """Return the configured content options, or a default instance when unset."""
    optional = self.config.optional
    if not optional or not optional.content:
        return WordPressOptionalContent()
    return self.config.optional.content
61
+
62
def test_connection(self) -> dict[str, Any]:
    """Probe the WordPress REST API and report whether it is reachable.

    Returns:
        A dict with ``timestamp``, ``source_type``, ``status`` ("SUCCESS" or
        "FAILURE") and ``message``. HTTP 401/403 are reported as success
        because the API answered; only private content needs credentials.
    """
    logger.info(f"Testing connection to WordPress at {self.config.required.url}...")

    result = {
        "timestamp": datetime.now(UTC).isoformat(),
        "source_type": self.recipe.get("type"),
    }

    try:
        response = self.session.get(
            f"{self.api_base}/posts", params={"per_page": 1}, timeout=10
        )
    except requests.exceptions.RequestException as e:
        result["status"] = "FAILURE"
        result["message"] = f"Failed to connect: {e!s}"
        logger.error(f"Connection test failed: {e}")
        return result

    status_code = response.status_code
    if status_code == 200:
        result["status"] = "SUCCESS"
        result["message"] = "Successfully connected to WordPress REST API."
        logger.info("Connection test successful")
        return result

    if status_code in (401, 403):
        result["status"] = "SUCCESS"
        result["message"] = (
            "WordPress REST API is reachable, but authentication is required for "
            "private content."
        )
        logger.info("Connection test successful (authentication required)")
        return result

    result["status"] = "FAILURE"
    result["message"] = f"Unexpected status from WordPress REST API: {status_code}"
    logger.error(result["message"])
    return result
98
+
99
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Extract posts and pages from WordPress, yielding assets in batches.

    Posts are streamed first, then pages. Unless the sampling strategy is
    ``ALL``, both share a single item budget taken from ``rows_per_page``
    (100 when unset). Full batches of ``BATCH_SIZE`` assets are yielded as
    soon as they fill up; the remainder is yielded at the end.
    """
    if self._aborted:
        return

    logger.info("Extracting metadata from WordPress...")

    # Start every run with clean lookup tables.
    self._url_to_wp_id = {}
    self._hash_to_url = {}
    self._seen_asset_hashes = set()

    buffered: list[SingleAssetScanResults] = []
    options = self._content_options()
    sampling = self.config.sampling
    limit: int | None
    if sampling.strategy == SamplingStrategy.ALL:
        limit = None
    else:
        limit = int(sampling.rows_per_page or 100)
    extracted_total = 0

    # (content type, enabled flag, whether an exhausted budget skips it).
    # Posts are always attempted; pages are skipped once the budget is used up.
    plan = (
        ("posts", options.fetch_posts, False),
        ("pages", options.fetch_pages, True),
    )

    for content_type, enabled, skip_when_exhausted in plan:
        if enabled is False:
            continue
        if skip_when_exhausted and limit and extracted_total >= limit:
            continue

        type_items = 0
        type_assets = 0
        remaining = limit - extracted_total if limit else None
        stream = self._stream_content_type(content_type, remaining, sampling.strategy)
        for chunk, chunk_items in stream:
            type_items += chunk_items
            extracted_total += chunk_items
            type_assets += len(chunk)

            for asset in chunk:
                buffered.append(asset)
                while len(buffered) >= self.BATCH_SIZE:
                    ready = buffered[: self.BATCH_SIZE]
                    buffered = buffered[self.BATCH_SIZE :]
                    if ready:
                        yield ready

        logger.info(f"Extracted {type_items} {content_type} into {type_assets} assets")

    if buffered:
        yield buffered

    logger.info("Total extracted WordPress items: %s", extracted_total)
170
+
171
def _stream_content_type(
    self,
    content_type: str,
    limit: int | None,
    strategy: SamplingStrategy = SamplingStrategy.LATEST,
) -> Generator[tuple[list[SingleAssetScanResults], int], None, None]:
    """Stream transformed assets for a content type while paginating the API.

    Args:
        content_type: REST collection name appended to the API base
            (e.g. ``"posts"`` or ``"pages"``).
        limit: Maximum number of items to transform; ``None`` means no cap.
        strategy: Sampling strategy; ``LATEST`` requests newest-modified first.

    Yields:
        ``(assets, items_count)`` per fetched API page, where ``assets`` are the
        not-yet-seen page/image assets and ``items_count`` is the number of
        WordPress items successfully transformed on that page.
    """
    endpoint = f"{self.api_base}/{content_type}"
    items_extracted = 0
    page = 1
    max_page_size = 100

    # The page size must stay constant across requests: WordPress derives the
    # offset from (page - 1) * per_page, so shrinking per_page on a later page
    # would re-fetch items that were already returned and silently
    # under-deliver. Surplus items on the last page are cut off by the
    # per-item limit check below.
    per_page = min(max_page_size, limit) if limit else max_page_size

    base_params: dict[str, Any] = {
        "per_page": per_page,
        "_embed": "author,wp:term",
    }

    # Only LATEST asks for explicit ordering (newest modification first);
    # other strategies use the API's default ordering because the WP API has
    # no random ordering.
    if strategy == SamplingStrategy.LATEST:
        base_params["orderby"] = "modified"
        base_params["order"] = "desc"

    content_options = self._content_options()
    if content_options.post_status:
        base_params["status"] = ",".join(content_options.post_status)

    while True:
        if self._aborted or (limit and items_extracted >= limit):
            break

        params = {**base_params, "page": page}

        try:
            response = self.session.get(endpoint, params=params, timeout=30)
            response.raise_for_status()
            items = response.json()
        except requests.exceptions.RequestException as e:
            logger.error(f"Failed to fetch {content_type} page {page}: {e}")
            break

        if not items:
            break

        total_items = int(response.headers.get("X-WP-Total", 0))
        total_pages = int(response.headers.get("X-WP-TotalPages", 1))

        logger.info(
            f"Fetching {content_type} page {page}/{total_pages} "
            f"({len(items)} items, total: {total_items})"
        )

        page_assets: list[SingleAssetScanResults] = []
        page_items_extracted = 0
        for item in items:
            if self._aborted or (limit and items_extracted >= limit):
                break

            try:
                page_asset, image_assets = self._transform_item_to_assets(
                    item, content_type
                )
                self._add_asset_if_new(page_assets, page_asset)
                for image_asset in image_assets:
                    self._add_asset_if_new(page_assets, image_asset)
                items_extracted += 1
                page_items_extracted += 1
            except Exception as e:
                # One malformed item must not abort the whole page.
                logger.error(
                    f"Failed to transform {content_type} item {item.get('id')}: {e}"
                )
                continue

        if page_items_extracted > 0:
            yield page_assets, page_items_extracted

        # A short page or the last announced page means nothing is left.
        if page >= total_pages or len(items) < per_page:
            break

        page += 1
255
+
256
def _fetch_content_type(
    self, content_type: str, limit: int | None
) -> tuple[list[SingleAssetScanResults], int]:
    """Drain ``_stream_content_type`` into memory (compatibility helper for tests).

    Returns:
        All streamed assets and the total number of extracted items.
    """
    collected: list[SingleAssetScanResults] = []
    total_items = 0

    stream = self._stream_content_type(
        content_type, limit, self.config.sampling.strategy
    )
    for chunk, chunk_items in stream:
        collected += chunk
        total_items += chunk_items

    return collected, total_items
269
+
270
def _add_asset_if_new(
    self, results: list[SingleAssetScanResults], asset: SingleAssetScanResults
) -> None:
    """Append ``asset`` to ``results`` unless its hash was already emitted this run."""
    asset_hash = asset.hash
    if asset_hash not in self._seen_asset_hashes:
        self._seen_asset_hashes.add(asset_hash)
        results.append(asset)
277
+
278
+ def _parse_wordpress_date(self, date_str: str | None) -> str | None:
279
+ """Parse WordPress date and ensure it has timezone info."""
280
+ if not date_str:
281
+ return None
282
+
283
+ if "+" in date_str or date_str.endswith("Z"):
284
+ return date_str
285
+
286
+ return f"{date_str}+00:00"
287
+
288
+ def _parse_datetime(self, date_str: str | None) -> datetime:
289
+ if not date_str:
290
+ return datetime.now(UTC)
291
+ normalized = date_str.replace("Z", "+00:00")
292
+ try:
293
+ parsed = datetime.fromisoformat(normalized)
294
+ except ValueError:
295
+ return datetime.now(UTC)
296
+ if parsed.tzinfo is None:
297
+ return parsed.replace(tzinfo=UTC)
298
+ return parsed
299
+
300
def _transform_item(self, item: dict[str, Any], content_type: str) -> SingleAssetScanResults:
    """Transform a WordPress item into its page URL asset, discarding image assets."""
    primary_asset, _image_assets = self._transform_item_to_assets(item, content_type)
    return primary_asset
304
+
305
def _transform_item_to_assets(
    self, item: dict[str, Any], content_type: str
) -> tuple[SingleAssetScanResults, list[SingleAssetScanResults]]:
    """Build the URL asset for a WordPress item plus one asset per embedded image.

    Args:
        item: One item dict as returned by the WordPress REST API.
        content_type: Collection the item came from; used for the fallback title.

    Returns:
        ``(page_asset, image_assets)``. The page asset links to the hashes of
        all images and anchors found in the rendered content.
    """
    raw_id = item.get("id")
    wp_id = "" if raw_id is None else str(raw_id)
    slug = str(item.get("slug") or "")

    page_url = self._build_item_url(item, slug, wp_id)
    page_hash = self.generate_hash_id(page_url)

    if wp_id:
        # Index by hash and by URL so later content lookups accept either form.
        self._url_to_wp_id.update({page_hash: wp_id, page_url: wp_id})

    raw_title = item.get("title", {})
    if isinstance(raw_title, dict):
        title_text = raw_title.get("rendered", "")
    else:
        title_text = str(raw_title)
    title = self._strip_html(title_text)
    if not title:
        title = f"WordPress {content_type.rstrip('s')} {wp_id}"

    raw_excerpt = item.get("excerpt", {})
    excerpt_html = raw_excerpt.get("rendered", "") if isinstance(raw_excerpt, dict) else ""
    excerpt = self._strip_html(excerpt_html)[:200]

    raw_content = item.get("content", {})
    html_content = raw_content.get("rendered", "") if isinstance(raw_content, dict) else ""

    image_urls, link_urls = self._extract_related_urls(html_content)
    image_hashes = list(map(self.generate_hash_id, image_urls))
    link_hashes = list(map(self.generate_hash_id, link_urls))
    page_links = self._unique_preserve_order(image_hashes + link_hashes)

    # Prefer the GMT fields; fall back to the plain date fields when absent.
    created_raw = item.get("date_gmt", item.get("date"))
    updated_raw = item.get("modified_gmt", item.get("modified"))
    created_dt = self._parse_datetime(self._parse_wordpress_date(created_raw))
    updated_dt = self._parse_datetime(self._parse_wordpress_date(updated_raw))

    # Key order is kept stable because the checksum is derived from this dict.
    metadata = {
        "wp_id": wp_id,
        "title": title,
        "slug": slug,
        "status": item.get("status"),
        "modified": updated_dt.isoformat(),
        "excerpt": excerpt[:100] if excerpt else None,
        "images_count": len(image_urls),
        "links_count": len(link_urls),
    }

    page_asset = SingleAssetScanResults(
        hash=page_hash,
        checksum=self.calculate_checksum(metadata),
        name=title,
        external_url=page_url,
        links=page_links,
        asset_type=OutputAssetType.URL,
        source_id=self.source_id,
        created_at=created_dt,
        updated_at=updated_dt,
        runner_id=self.runner_id,
    )

    image_assets: list[SingleAssetScanResults] = []
    for image_url, image_hash in zip(image_urls, image_hashes, strict=False):
        image_assets.append(
            self._make_image_asset(
                image_url=image_url,
                image_hash=image_hash,
                page_hash=page_hash,
                created_at=created_dt,
                updated_at=updated_dt,
            )
        )

    return page_asset, image_assets
376
+
377
+ def _build_item_url(self, item: dict[str, Any], slug: str, wp_id: str) -> str:
378
+ link = item.get("link")
379
+ if isinstance(link, str) and link.strip():
380
+ return link
381
+
382
+ if slug:
383
+ return f"{self.site_base_url}/{slug.lstrip('/')}"
384
+
385
+ if wp_id:
386
+ return f"{self.site_base_url}/?p={wp_id}"
387
+
388
+ return self.site_base_url
389
+
390
+ def _extract_related_urls(self, html_content: str) -> tuple[list[str], list[str]]:
391
+ if not html_content:
392
+ return [], []
393
+
394
+ soup = BeautifulSoup(html_content, "html.parser")
395
+ image_urls: list[str] = []
396
+ link_urls: list[str] = []
397
+
398
+ for image in soup.find_all("img"):
399
+ src = image.get("src")
400
+ if isinstance(src, str):
401
+ normalized = self._normalize_external_url(src)
402
+ if normalized:
403
+ image_urls.append(normalized)
404
+
405
+ for anchor in soup.find_all("a"):
406
+ href = anchor.get("href")
407
+ if isinstance(href, str):
408
+ normalized = self._normalize_external_url(href)
409
+ if normalized:
410
+ link_urls.append(normalized)
411
+
412
+ return (
413
+ self._unique_preserve_order(image_urls),
414
+ self._unique_preserve_order(link_urls),
415
+ )
416
+
417
def _normalize_external_url(self, raw_url: str) -> str | None:
    """Normalize ``raw_url`` via ``normalize_http_url`` using the site base URL."""
    site_base = self.site_base_url
    return normalize_http_url(raw_url, base_url=site_base)
419
+
420
def _make_image_asset(
    self,
    *,
    image_url: str,
    image_hash: str,
    page_hash: str,
    created_at: datetime,
    updated_at: datetime,
) -> SingleAssetScanResults:
    """Build an IMAGE asset for an image referenced by a page.

    The checksum covers the image URL and the hash of the referencing page;
    timestamps are the ones passed in by the caller.
    """
    checksum_input = {
        "url": image_url,
        "referenced_by": page_hash,
    }
    display_name = self._image_name_from_url(image_url)
    checksum = self.calculate_checksum(checksum_input)

    return SingleAssetScanResults(
        hash=image_hash,
        checksum=checksum,
        name=display_name,
        external_url=image_url,
        links=[],
        asset_type=OutputAssetType.IMAGE,
        source_id=self.source_id,
        created_at=created_at,
        updated_at=updated_at,
        runner_id=self.runner_id,
    )
447
+
448
+ def _image_name_from_url(self, image_url: str) -> str:
449
+ parsed = urlsplit(image_url)
450
+ file_name = parsed.path.rstrip("/").split("/")[-1]
451
+ return f"Image: {file_name}" if file_name else f"Image: {image_url}"
452
+
453
+ def _unique_preserve_order(self, values: list[str]) -> list[str]:
454
+ seen: set[str] = set()
455
+ unique_values: list[str] = []
456
+ for value in values:
457
+ if value in seen:
458
+ continue
459
+ seen.add(value)
460
+ unique_values.append(value)
461
+ return unique_values
462
+
463
def _strip_html(self, html: str) -> str:
    """Remove everything matched by ``HTML_TAG_RE`` and trim surrounding whitespace."""
    without_tags = HTML_TAG_RE.sub("", html)
    return without_tags.strip()
466
+
467
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """
    Fetch full content for a WordPress URL asset (for detector scanning).

    The REST API is tried first when the asset resolves to a WordPress item
    id; otherwise, or when that yields nothing, the asset is fetched by URL.

    Returns:
        ``(html, text)`` on success, ``None`` when nothing could be fetched
        or any error occurred (errors are logged, never raised).
    """
    try:
        from ...utils.content_extraction import html_to_text

        logger.info(f"Fetching content for WordPress asset {asset_id}")

        wp_id = self._resolve_wordpress_item_id(asset_id)
        html_content: str | None = self._fetch_content_by_wp_id(wp_id) if wp_id else None

        if not html_content:
            fallback_url = normalize_http_url(asset_id, base_url=self.site_base_url)
            if fallback_url:
                html_content = self._fetch_content_by_url(fallback_url)

        if html_content:
            text_content = html_to_text(html_content)
            logger.debug(
                f"Fetched {len(html_content)} bytes of HTML, "
                f"extracted {len(text_content)} bytes of text"
            )
            return html_content, text_content

        logger.warning(f"No content found for asset {asset_id}")
        return None

    except Exception as e:
        logger.error(f"Failed to fetch content for WordPress asset {asset_id}: {e}")
        return None
501
+
502
def _resolve_wordpress_item_id(self, asset_id: str) -> str | None:
    """Map an asset id (URL or hash) to a WordPress item id.

    The lookup table filled during extraction is consulted first. Failing
    that, the id is decoded with ``unhash_id`` and the last all-digit part of
    the ``_#_``-separated result is returned. ``None`` when nothing matches.
    """
    known_ids = self._url_to_wp_id

    normalized = normalize_http_url(asset_id, base_url=self.site_base_url)
    if normalized and normalized in known_ids:
        return known_ids[normalized]
    if asset_id in known_ids:
        return known_ids[asset_id]

    try:
        decoded = unhash_id(asset_id)
    except Exception:
        return None

    numeric_parts = (part for part in reversed(decoded.split("_#_")) if part.isdigit())
    return next(numeric_parts, None)
519
+
520
+ def _fetch_content_by_wp_id(self, wp_id: str) -> str | None:
521
+ response = None
522
+ for endpoint in ["posts", "pages"]:
523
+ try:
524
+ url = f"{self.api_base}/{endpoint}/{wp_id}"
525
+ response = self.session.get(url, timeout=10)
526
+ response.raise_for_status()
527
+ break
528
+ except requests.exceptions.RequestException:
529
+ continue
530
+
531
+ if not response or not response.ok:
532
+ return None
533
+
534
+ data = response.json()
535
+ content_obj = data.get("content", {})
536
+ html_content = content_obj.get("rendered", "")
537
+ if not isinstance(html_content, str) or not html_content:
538
+ return None
539
+
540
+ return html_content
541
+
542
+ def _fetch_content_by_url(self, url: str) -> str | None:
543
+ try:
544
+ response = self.session.get(url, timeout=10)
545
+ response.raise_for_status()
546
+ except requests.exceptions.RequestException:
547
+ return None
548
+
549
+ content_type = response.headers.get("Content-Type", "").lower()
550
+ if content_type.startswith("image/"):
551
+ return None
552
+ if "html" in content_type or not content_type:
553
+ return response.text
554
+ return None
555
+
556
def generate_hash_id(self, asset_id: str) -> str:
    """Generate a stable hash ID for URL-like assets and keep reverse lookup for detectors.

    Raises:
        ValueError: If ``asset_id`` cannot be normalized into a URL.
    """
    site_base = self.site_base_url
    normalized = normalize_http_url(asset_id, base_url=site_base)
    if normalized:
        digest = hash_url(normalized, base_url=site_base)
        # Remember hash -> URL so links can be resolved back for detection.
        self._hash_to_url[digest] = normalized
        return digest
    raise ValueError(f"Invalid URL for hash: {asset_id}")
565
+
566
+ def resolve_link_for_detection(self, link: str) -> str | None:
567
+ mapped = self._hash_to_url.get(link)
568
+ if mapped:
569
+ return mapped
570
+ return normalize_http_url(link)
571
+
572
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Point the finding's location at the asset's external URL.

    ``text_content`` is accepted but not used.
    """
    asset_location = Location(path=asset.external_url)
    finding.location = asset_location
579
+
580
def abort(self) -> None:
    """Signal the source to stop extraction and close the HTTP session."""
    logger.info("Aborting WordPress extraction...")
    super().abort()
    # The session attribute may not exist yet; skip closing in that case.
    if not hasattr(self, "session"):
        return
    self.session.close()
586
+
587
def cleanup(self) -> None:
    """Clean up resources by closing the HTTP session when one exists."""
    if not hasattr(self, "session"):
        return
    self.session.close()
src/telemetry.py ADDED
@@ -0,0 +1,96 @@
1
+ """
2
+ OpenTelemetry initialisation for ephemeral Kubernetes CLI jobs.
3
+
4
+ Key design points:
5
+ - BatchSpanProcessor with 2 s delay so spans export before the job finishes.
6
+ - Explicit force_flush() + SIGTERM handler — Python's default SIGTERM kills
7
+ the process before atexit hooks run, which would silently drop spans.
8
+ - Graceful no-op when OTel packages are not installed or telemetry is disabled.
9
+
10
+ Opt-out: set TELEMETRY_DISABLED=1 or DO_NOT_TRACK=1 in the environment.
11
+ Install: uv sync --group otel
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import atexit
17
+ import os
18
+ import signal
19
+ import sys
20
+ from functools import partial
21
+ from typing import Any
22
+
23
+
24
def is_telemetry_enabled() -> bool:
    """Return ``False`` when any opt-out environment variable is set.

    Opt-outs: ``TELEMETRY_DISABLED=1``, ``DO_NOT_TRACK=1`` or
    ``CLASSIFYRE_TELEMETRY=false`` (case-insensitive).
    """
    opted_out = (
        os.getenv("TELEMETRY_DISABLED") == "1"
        or os.getenv("DO_NOT_TRACK") == "1"
        or os.getenv("CLASSIFYRE_TELEMETRY", "true").lower() == "false"
    )
    return not opted_out
32
+
33
+
34
def init_telemetry() -> Any:
    """
    Initialise the OTel SDK and return the TracerProvider.

    Returns ``None`` (and installs nothing) when telemetry is disabled or the
    OTel SDK is not installed (the ``otel`` optional dependency group was not
    installed). Callers must therefore handle a ``None`` result.

    On success the provider is registered globally, and a flush-and-shutdown
    hook is installed for normal interpreter exit and for SIGTERM.
    """
    if not is_telemetry_enabled():
        return None

    try:
        from opentelemetry import trace
        from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
            OTLPSpanExporter,
        )
        from opentelemetry.sdk.resources import Resource
        from opentelemetry.sdk.trace import TracerProvider
        from opentelemetry.sdk.trace.export import BatchSpanProcessor
    except ImportError:
        # OTel optional group not installed — run without telemetry.
        return None

    resource = Resource.create(
        {
            "service.name": os.getenv("OTEL_SERVICE_NAME", "classifyre-cli"),
            "service.version": os.getenv("SERVICE_VERSION", "0.0.0"),
            "deployment.environment.name": os.getenv(
                "DEPLOY_ENV", os.getenv("NODE_ENV", "production")
            ),
            "service.namespace": "classifyre",
            "service.instance.id": os.getenv("CLASSIFYRE_INSTANCE_ID", ""),
        }
    )

    provider = TracerProvider(resource=resource)
    provider.add_span_processor(
        BatchSpanProcessor(
            OTLPSpanExporter(),
            # Reduced delay so spans export before a short-lived job exits.
            schedule_delay_millis=2_000,
            max_queue_size=512,
            max_export_batch_size=256,
        )
    )
    trace.set_tracer_provider(provider)

    # Mutable flag shared by the closures below so shutdown runs only once,
    # whether triggered by SIGTERM, atexit, or both.
    _shutdown_state: dict[str, bool] = {"called": False}

    def _shutdown(timeout_ms: int = 10_000) -> None:
        if _shutdown_state["called"]:
            return
        _shutdown_state["called"] = True
        provider.force_flush(timeout_millis=timeout_ms)
        provider.shutdown()

    def _handle_sigterm(signum: int, frame: Any) -> None:  # noqa: ARG001
        # Flush with a shorter timeout, then exit with 128 + SIGTERM(15).
        _shutdown(timeout_ms=8_000)
        sys.exit(143)

    atexit.register(_shutdown)
    try:
        signal.signal(signal.SIGTERM, partial(_handle_sigterm))
    except ValueError:
        # signal.signal() may only be called from the main thread of the main
        # interpreter. Elsewhere, keep telemetry running and rely on the
        # atexit hook alone rather than failing initialisation.
        pass

    return provider
src/utils/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Utility functions."""