classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,59 @@
1
+ """ContentProvider that wraps a BaseSource and applies file_parser for binary→text conversion."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ from collections.abc import AsyncGenerator
7
+
8
+ from ..models.generated_single_asset_scan_results import DetectionResult, SingleAssetScanResults
9
+ from ..sources.base import BaseSource
10
+
11
+
12
class ParsedContentProvider:
    """
    Adapter that exposes a BaseSource to the pipeline as text pages and raw bytes.

    Text path: streams the non-empty text of ``source.fetch_content_pages()``.
    Only when that produced no text at all does it fall back to
    ``source.fetch_content_bytes()`` followed by ``source.iter_asset_pages()``.

    Binary path: forwards straight to ``source.fetch_content_bytes()``.
    """

    def __init__(self, source: BaseSource) -> None:
        self._source = source

    async def fetch_text_pages(self, asset_id: str) -> AsyncGenerator[str, None]:
        """Yield text pages for *asset_id*, falling back to parsing the raw bytes."""
        yielded_any = False
        async for _raw, page_text in self._source.fetch_content_pages(asset_id):
            if not page_text:
                continue
            yielded_any = True
            yield page_text

        if not yielded_any:
            fetched = await self._source.fetch_content_bytes(asset_id)
            if fetched is not None:
                payload, mime_type = fetched
                # The page iterator is drained in a worker thread so that it
                # does not run on the event loop thread.
                extracted: list[str] = await asyncio.to_thread(
                    list,
                    self._source.iter_asset_pages(payload, mime_type),
                )
                for extracted_page in extracted:
                    yield extracted_page

    async def fetch_bytes(self, asset_id: str) -> tuple[bytes, str] | None:
        """Return whatever the source's ``fetch_content_bytes()`` returns."""
        fetched = await self._source.fetch_content_bytes(asset_id)
        return fetched

    def enrich_finding_location(
        self,
        finding: DetectionResult,
        asset: SingleAssetScanResults,
        text_content: str,
    ) -> None:
        """Delegate location enrichment of *finding* to the wrapped source."""
        self._source.enrich_finding_location(finding, asset, text_content)

    def resolve_link_for_detection(self, link: str) -> str | None:
        """Delegate link resolution to the wrapped source."""
        resolved = self._source.resolve_link_for_detection(link)
        return resolved
@@ -0,0 +1,5 @@
1
+ """Sandbox module for running detectors on local files."""
2
+
3
+ from .runner import SandboxRunner
4
+
5
+ __all__ = ["SandboxRunner"]
src/sandbox/runner.py ADDED
@@ -0,0 +1,145 @@
1
+ """SandboxRunner: run detectors on a local file."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import logging
7
+ from datetime import UTC, datetime
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ from ..models.generated_single_asset_scan_results import DetectionResult
12
+ from ..utils.file_parser import ParsedFile, parse_file
13
+
14
logger = logging.getLogger(__name__)

# Content longer than this is truncated before it is handed to detectors (1 MB).
_CONTENT_SIZE_LIMIT = 1024 * 1024
17
+
18
+
19
+ class SandboxRunner:
20
+ """Run a set of detectors against a single local file."""
21
+
22
+ def __init__(self, detectors_config: list[dict[str, Any]]) -> None:
23
+ self._config = detectors_config
24
+
25
+ def _build_detectors(self) -> list[Any]:
26
+ from ..detectors import get_detector
27
+ from ..detectors.config import parse_detector_config
28
+
29
+ detectors = []
30
+ for item in self._config:
31
+ if not item.get("enabled", True):
32
+ continue
33
+
34
+ detector_type = item.get("type", "").upper()
35
+ raw_config = item.get("config", {})
36
+
37
+ try:
38
+ detector_name, typed_config = parse_detector_config(
39
+ detector_type=detector_type,
40
+ raw_config=raw_config,
41
+ )
42
+
43
+ detector = get_detector(detector_name, typed_config)
44
+ detectors.append(detector)
45
+ logger.info(f"Initialized sandbox detector: {detector_name}")
46
+ except Exception as e:
47
+ logger.error(f"Failed to initialize detector {detector_type}: {e}")
48
+
49
+ return detectors
50
+
51
+ @staticmethod
52
+ def _is_binary_detector(detector: Any) -> bool:
53
+ for ct in detector.get_supported_content_types():
54
+ if ct.startswith(("image/", "audio/", "video/")) or ct == "application/octet-stream":
55
+ return True
56
+ return False
57
+
58
+ @staticmethod
59
+ def _supports_mime(supported: list[str], mime_type: str) -> bool:
60
+ if mime_type in supported:
61
+ return True
62
+ for s in supported:
63
+ if s.endswith("/*") and mime_type.startswith(s[:-1]):
64
+ return True
65
+ return False
66
+
67
+ async def run_async(self, file_path: Path) -> tuple[ParsedFile, list[DetectionResult]]:
68
+ """Parse the file and run all enabled detectors."""
69
+ parsed = parse_file(file_path)
70
+
71
+ detectors = self._build_detectors()
72
+ if not detectors:
73
+ return parsed, []
74
+
75
+ tasks = []
76
+ active_detectors = []
77
+
78
+ if parsed.is_binary:
79
+ raw_bytes = file_path.read_bytes()
80
+ mime_type = parsed.mime_type
81
+ if len(raw_bytes) > _CONTENT_SIZE_LIMIT:
82
+ logger.warning(
83
+ f"Binary content ({len(raw_bytes)} bytes) exceeds limit "
84
+ f"({_CONTENT_SIZE_LIMIT} bytes); truncating."
85
+ )
86
+ raw_bytes = raw_bytes[:_CONTENT_SIZE_LIMIT]
87
+ for detector in detectors:
88
+ if self._is_binary_detector(detector) and self._supports_mime(
89
+ detector.get_supported_content_types(), mime_type
90
+ ):
91
+ tasks.append(detector.detect(raw_bytes, mime_type))
92
+ active_detectors.append(detector)
93
+ else:
94
+ if parsed.parse_error:
95
+ logger.warning(
96
+ "Text extraction failed (%s): %s", parsed.mime_type, parsed.parse_error
97
+ )
98
+ text = parsed.text_content
99
+ if not text.strip():
100
+ logger.warning(
101
+ "No text content extracted from %s file; skipping text detectors.",
102
+ parsed.mime_type,
103
+ )
104
+ return parsed, []
105
+ if len(text) > _CONTENT_SIZE_LIMIT:
106
+ logger.warning(
107
+ f"Content size ({len(text)} bytes) exceeds limit "
108
+ f"({_CONTENT_SIZE_LIMIT} bytes); truncating."
109
+ )
110
+ text = text[:_CONTENT_SIZE_LIMIT]
111
+ for detector in detectors:
112
+ supported = detector.get_supported_content_types()
113
+ if "text/plain" in supported:
114
+ tasks.append(detector.detect(text, "text/plain"))
115
+ active_detectors.append(detector)
116
+
117
+ if not tasks:
118
+ return parsed, []
119
+
120
+ results = await asyncio.gather(*tasks, return_exceptions=True)
121
+
122
+ all_findings: list[DetectionResult] = []
123
+ detected_at = datetime.now(UTC)
124
+
125
+ for detector, result in zip(active_detectors, results, strict=False):
126
+ if isinstance(result, Exception):
127
+ logger.error(f"Detector {detector.__class__.__name__} failed: {result}")
128
+ continue
129
+ if isinstance(result, list):
130
+ for finding in result:
131
+ if isinstance(finding, DetectionResult):
132
+ all_findings.append(
133
+ finding.model_copy(
134
+ update={
135
+ "runner_id": "sandbox",
136
+ "detected_at": detected_at,
137
+ }
138
+ )
139
+ )
140
+
141
+ return parsed, all_findings
142
+
143
+ def run(self, file_path: Path) -> tuple[ParsedFile, list[DetectionResult]]:
144
+ """Synchronous wrapper around run_async."""
145
+ return asyncio.run(self.run_async(file_path))
@@ -0,0 +1,95 @@
1
+ import importlib
2
+ import inspect
3
+ import logging
4
+ import pkgutil
5
+ from typing import Any
6
+
7
+ from .base import BaseSource
8
+
9
logger = logging.getLogger(__name__)

# Source type name -> concrete BaseSource subclass; filled by _discover_sources().
_registry: dict[str, type[BaseSource]] = dict()
12
+
13
+
14
def _discover_sources() -> None:
    """
    Automatically discover and register all BaseSource subclasses
    in the subpackages of src.sources.

    Discovery runs once: if the registry already has entries the call is a
    no-op. A module or package that fails to import is logged and skipped so
    that one broken source does not hide the others.
    """
    if _registry:
        return

    def _on_package_error(package_name: str) -> None:
        # Without an onerror callback, walk_packages propagates any
        # non-ImportError raised by a subpackage and aborts the whole walk.
        logger.error("Failed to import package %s", package_name, exc_info=True)

    # Iterate over all modules below the current package
    for _loader, module_name, is_pkg in pkgutil.walk_packages(
        __path__, __name__ + ".", onerror=_on_package_error
    ):
        if is_pkg:
            continue

        try:
            module = importlib.import_module(module_name)
            # getmembers yields (name, class) pairs sorted by name, like dir().
            for _attr_name, attr in inspect.getmembers(module, inspect.isclass):
                # Keep concrete BaseSource subclasses only
                if (
                    not issubclass(attr, BaseSource)
                    or attr is BaseSource
                    or inspect.isabstract(attr)
                ):
                    continue

                # Prefer an explicit 'source_type' class attribute; otherwise
                # derive the name from the class: WordPressSource -> wordpress
                source_type = getattr(attr, "source_type", None)
                if not source_type:
                    source_type = attr.__name__.replace("Source", "").lower()

                existing = _registry.get(source_type)
                if existing is attr:
                    # Same class seen again through another module's namespace
                    # (an import or re-export); not a real conflict.
                    continue
                if existing is not None:
                    logger.warning(
                        "Duplicate source type '%s' registered by %s",
                        source_type,
                        attr.__name__,
                    )
                else:
                    _registry[source_type] = attr
                    logger.debug(
                        "Registered source type '%s' from %s", source_type, module_name
                    )
        except Exception as e:
            logger.error("Failed to import module %s: %s", module_name, e)
56
+
57
+
58
def get_source(
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
) -> BaseSource:
    """
    Factory function to create a source instance from a recipe.

    Args:
        recipe: Source configuration
        source_id: Optional source ID for asset attribution
        runner_id: Optional runner ID for tracking

    Raises:
        ValueError: If the recipe has no 'type' or the type is not registered.
    """
    _discover_sources()

    # 'type' may be present but null; treat that the same as a missing type.
    source_type = str(recipe.get("type") or "").lower()
    if not source_type:
        raise ValueError("Recipe must have a 'type' field")

    source_class = _registry.get(source_type)
    if not source_class:
        available = ", ".join(sorted(_registry.keys()))
        raise ValueError(f"Source type '{source_type}' not found. Available sources: {available}")

    # Prefer passing source_id + runner_id for full attribution, then
    # runner_id only, then the recipe alone.
    keyword_options: tuple[dict[str, Any], ...] = (
        {"source_id": source_id, "runner_id": runner_id},
        {"runner_id": runner_id},
        {},
    )

    try:
        signature = inspect.signature(source_class)
    except (TypeError, ValueError):
        signature = None

    if signature is not None:
        # Decide which keywords the constructor accepts by binding against its
        # signature rather than by calling it and catching TypeError: that
        # would also swallow TypeErrors raised inside the constructor body and
        # run the constructor several times.
        for keywords in keyword_options:
            try:
                signature.bind(recipe, **keywords)
            except TypeError:
                continue
            return source_class(recipe, **keywords)
        # Nothing binds; let the constructor raise its own descriptive error.
        return source_class(recipe)

    # Signature not introspectable: fall back to trying each call form.
    try:
        return source_class(recipe, source_id=source_id, runner_id=runner_id)
    except TypeError:
        try:
            return source_class(recipe, runner_id=runner_id)
        except TypeError:
            return source_class(recipe)
90
+
91
+
92
def list_available_sources() -> list[str]:
    """Return the names of all registered source types in sorted order."""
    _discover_sources()
    names = list(_registry)
    names.sort()
    return names
@@ -0,0 +1,389 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ import random
6
+ import re
7
+ import time
8
+ from datetime import UTC, datetime
9
+ from typing import Any
10
+ from urllib.parse import urljoin, urlsplit, urlunsplit
11
+
12
+ import requests
13
+
14
+ from ..utils.hashing import normalize_http_url
15
+
16
logger = logging.getLogger(__name__)

# Matches http(s) URLs made of the characters listed in the class below.
URL_RE = re.compile(r"https?://[\w\-._~:/?#\[\]@!$&'()*+,;=%]+", re.IGNORECASE)

# Mime types treated as tabular data (CSV/TSV, Excel, Parquet).
TABULAR_MIME_TYPES = {
    "application/parquet",
    "application/vnd.apache.parquet",
    "application/vnd.ms-excel",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "text/csv",
    "text/tab-separated-values",
}

# File extensions treated as tabular data.
TABULAR_FILE_EXTENSIONS = {".csv", ".parquet", ".tsv", ".xls", ".xlsx"}
36
+
37
+
38
def normalize_atlassian_base_url(url: str, *, strip_wiki: bool = False) -> str:
    """Return *url* reduced to scheme, host and path without a trailing slash.

    Query and fragment are dropped. With ``strip_wiki=True`` a trailing
    ``/wiki`` path segment is removed as well.

    Raises:
        ValueError: If ``normalize_http_url`` returns a falsy value for *url*.
    """
    cleaned = normalize_http_url(url)
    if not cleaned:
        raise ValueError(f"Invalid Atlassian base URL: {url}")

    parts = urlsplit(cleaned)
    trimmed_path = parts.path.rstrip("/")
    if strip_wiki:
        trimmed_path = trimmed_path.removesuffix("/wiki")
    return urlunsplit((parts.scheme, parts.netloc, trimmed_path, "", ""))
48
+
49
+
50
+ def parse_datetime(value: str | None) -> datetime:
51
+ if not value:
52
+ return datetime.now(UTC)
53
+ normalized = value.replace("Z", "+00:00")
54
+ try:
55
+ parsed = datetime.fromisoformat(normalized)
56
+ except ValueError:
57
+ return datetime.now(UTC)
58
+ if parsed.tzinfo is None:
59
+ return parsed.replace(tzinfo=UTC)
60
+ return parsed
61
+
62
+
63
def dedupe_preserve_order(values: list[str]) -> list[str]:
    """Return *values* without duplicates, keeping each first occurrence in order."""
    # dict keys are unique and remember insertion order.
    return list(dict.fromkeys(values))
72
+
73
+
74
def deterministic_sample(items: list[Any], limit: int) -> list[Any]:
    """Return at most *limit* items, chosen reproducibly and kept in input order.

    If *limit* covers the whole list, the list itself is returned (not a copy).
    Otherwise positions are drawn with a fixed-seed generator, so the same
    input always gives the same selection.
    """
    population = len(items)
    if limit >= population:
        return items
    rng = random.Random(0)
    chosen = rng.sample(range(population), k=limit)
    chosen.sort()
    return [items[position] for position in chosen]
80
+
81
+
82
def extract_urls_from_text(text: str) -> list[str]:
    """Return the distinct URLs matched by ``URL_RE`` in *text*, in order of appearance."""
    if text:
        # URL_RE has no capture groups, so findall yields the full matches.
        return dedupe_preserve_order(URL_RE.findall(text))
    return []
86
+
87
+
88
def is_tabular_mime_type(mime_type: str) -> bool:
    """Return True if *mime_type*, ignoring parameters and case, is in TABULAR_MIME_TYPES."""
    # Drop anything after the first ';' (e.g. "; charset=utf-8").
    essence, _sep, _parameters = mime_type.partition(";")
    return essence.strip().lower() in TABULAR_MIME_TYPES
91
+
92
+
93
def is_tabular_filename(file_name: str) -> bool:
    """Return True if the path part of *file_name* ends with a tabular file extension.

    The name is split like a URL first, so a query string or fragment after
    the extension is ignored. The comparison is case-insensitive.
    """
    lowered_path = urlsplit(file_name).path.lower()
    return lowered_path.endswith(tuple(TABULAR_FILE_EXTENSIONS))
96
+
97
+
98
class AtlassianCloudClient:
    """Synchronous HTTP client for Atlassian REST APIs built on one ``requests.Session``.

    The session authenticates with the account email and API token (basic
    auth) and asks for JSON. Requests are retried on HTTP 429 and on 5xx
    responses, and an optional fixed delay is applied after each completed
    request.
    """

    def __init__(
        self,
        *,
        base_url: str,
        account_email: str,
        api_token: str,
        request_timeout_seconds: float = 30,
        max_retries: int = 3,
        rate_limit_delay_seconds: float = 0,
    ) -> None:
        """Create the session.

        The timeout is clamped to at least 1 second; retries and the rate-limit
        delay are clamped to be non-negative.
        """
        self.base_url = base_url.rstrip("/")
        self.request_timeout_seconds = max(float(request_timeout_seconds), 1.0)
        self.max_retries = max(int(max_retries), 0)
        self.rate_limit_delay_seconds = max(float(rate_limit_delay_seconds), 0.0)
        self.session = requests.Session()
        self.session.auth = (account_email, api_token)
        self.session.headers.update(
            {
                "Accept": "application/json",
            }
        )

    def close(self) -> None:
        """Close the underlying HTTP session."""
        self.session.close()

    def build_url(self, path_or_url: str) -> str:
        """Return *path_or_url* unchanged if absolute, else joined onto ``base_url``."""
        # NOTE(review): absolute URLs are requested with this session's
        # credentials whatever their host is — confirm callers only pass
        # trusted URLs.
        if path_or_url.startswith(("http://", "https://")):
            return path_or_url
        if not path_or_url.startswith("/"):
            path_or_url = f"/{path_or_url}"
        return f"{self.base_url}{path_or_url}"

    @staticmethod
    def _retry_after_seconds(header_value: str | None) -> int:
        """Return the whole seconds to wait for a ``Retry-After`` header, at least 1.

        Missing, non-numeric or non-finite values fall back to 1 second.
        """
        if not header_value:
            return 1
        try:
            return max(int(float(header_value)), 1)
        except (ValueError, OverflowError):
            # OverflowError covers "inf", which float() accepts but int() rejects.
            return 1

    def _request(
        self,
        method: str,
        path_or_url: str,
        *,
        params: dict[str, Any] | None = None,
        headers: dict[str, str] | None = None,
        stream: bool = False,
    ) -> requests.Response:
        """Send one request, retrying on 429 and 5xx up to ``max_retries`` times.

        The response of the final attempt is returned whatever its status; the
        caller decides whether to call ``raise_for_status()``.
        """
        url = self.build_url(path_or_url)
        max_attempts = self.max_retries + 1

        for attempt in range(1, max_attempts + 1):
            response = self.session.request(
                method,
                url,
                params=params,
                headers=headers,
                timeout=self.request_timeout_seconds,
                stream=stream,
            )
            can_retry = attempt < max_attempts

            if response.status_code == 429 and can_retry:
                retry_after = self._retry_after_seconds(response.headers.get("Retry-After"))
                logger.warning("Atlassian rate limit hit for %s. Retrying in %ss", url, retry_after)
                # Release the connection of the response we are discarding;
                # with stream=True it is otherwise held until garbage collection.
                response.close()
                time.sleep(retry_after)
                continue

            if response.status_code >= 500 and can_retry:
                # Exponential backoff: 1s, 2s, 4s, capped at 8s.
                sleep_seconds = min(2 ** (attempt - 1), 8)
                logger.warning(
                    "Atlassian server error %s for %s. Retrying in %ss",
                    response.status_code,
                    url,
                    sleep_seconds,
                )
                response.close()
                time.sleep(sleep_seconds)
                continue

            if self.rate_limit_delay_seconds > 0:
                time.sleep(self.rate_limit_delay_seconds)
            return response

        # Not reached: the last attempt always returns inside the loop.
        return response

    def get_json(
        self,
        path_or_url: str,
        *,
        params: dict[str, Any] | None = None,
    ) -> dict[str, Any]:
        """GET *path_or_url* and return the decoded JSON object.

        Raises:
            requests.HTTPError: If the final response has an error status.
            RuntimeError: If the body is not JSON or is not a JSON object.
        """
        response = self._request("GET", path_or_url, params=params)
        response.raise_for_status()
        try:
            payload = response.json()
        except ValueError as exc:
            raise RuntimeError(
                f"Atlassian API returned non-JSON response for {path_or_url}"
            ) from exc
        if not isinstance(payload, dict):
            raise RuntimeError(f"Expected JSON object response for {path_or_url}")
        return payload

    def get_bytes(self, path_or_url: str) -> tuple[bytes, str]:
        """GET *path_or_url* and return ``(body, mime_type)``.

        The mime type is the lower-cased ``Content-Type`` header without
        parameters, or an empty string if the header is absent.

        Raises:
            requests.HTTPError: If the final response has an error status.
        """
        response = self._request("GET", path_or_url, stream=True)
        try:
            response.raise_for_status()
            body = b"".join(chunk for chunk in response.iter_content(chunk_size=8192) if chunk)
            mime = response.headers.get("Content-Type", "").split(";")[0].strip().lower()
        finally:
            # A streamed response keeps its connection checked out until it is
            # closed, including when raise_for_status() raises.
            response.close()
        return body, mime

    def iter_confluence_results(
        self,
        path: str,
        *,
        params: dict[str, Any] | None = None,
    ) -> list[dict[str, Any]]:
        """Collect every dict in ``results`` across all pages, following ``_links.next``.

        *params* are sent with the first request only; later requests use the
        ``next`` link as given.
        """
        url = path
        next_params: dict[str, Any] | None = dict(params or {})
        results: list[dict[str, Any]] = []
        while True:
            payload = self.get_json(url, params=next_params)
            page_items = payload.get("results", [])
            if isinstance(page_items, list):
                results.extend(item for item in page_items if isinstance(item, dict))

            links = payload.get("_links", {})
            next_link = links.get("next") if isinstance(links, dict) else None
            if not isinstance(next_link, str) or not next_link:
                break
            # NOTE(review): a root-relative next link replaces any path in
            # base_url here — confirm this matches the API version in use.
            url = urljoin(self.base_url + "/", next_link)
            next_params = None
        return results

    def iter_jira_search_jql(
        self,
        *,
        jql: str,
        fields: list[str],
        max_results: int = 100,
    ) -> list[dict[str, Any]]:
        """Collect every issue dict for *jql*, paging with ``nextPageToken``.

        Paging stops when the payload reports ``isLast`` or carries no token.
        """
        results: list[dict[str, Any]] = []
        next_page_token: str | None = None
        while True:
            params: dict[str, Any] = {
                "jql": jql,
                "maxResults": max_results,
                "fields": ",".join(fields),
            }
            if next_page_token:
                params["nextPageToken"] = next_page_token
            payload = self.get_json("/rest/api/3/search/jql", params=params)
            issues = payload.get("issues", [])
            if isinstance(issues, list):
                results.extend(issue for issue in issues if isinstance(issue, dict))

            if payload.get("isLast") is True:
                break
            token = payload.get("nextPageToken")
            if not token:
                break
            next_page_token = str(token)
        return results

    def iter_servicedesk_values(
        self,
        path: str,
        *,
        params: dict[str, Any] | None = None,
        limit: int = 50,
    ) -> list[dict[str, Any]]:
        """Collect every dict in ``values`` across all pages using ``start``/``limit``.

        Paging stops when the payload reports ``isLastPage`` or a page
        contributes no items.
        """
        results: list[dict[str, Any]] = []
        start = 0
        base_params = dict(params or {})
        page_limit = max(int(limit), 1)

        while True:
            request_params = dict(base_params)
            request_params["start"] = start
            request_params["limit"] = page_limit
            payload = self.get_json(path, params=request_params)

            values = payload.get("values", [])
            if isinstance(values, list):
                results.extend(item for item in values if isinstance(item, dict))

            if payload.get("isLastPage") is True:
                break

            # Advance by the reported page size, or by the number of values
            # received when the size is missing or not a number.
            size = payload.get("size")
            try:
                size_int = int(size)
            except (TypeError, ValueError):
                size_int = len(values) if isinstance(values, list) else 0
            if size_int <= 0:
                break
            start += size_int
        return results
308
+
309
+
310
def parse_atlassian_document(value: Any) -> tuple[str, list[str]]:
    """Flatten a nested document value into text and a list of distinct URLs.

    Every string reached while walking nested lists and dicts is collected as
    text and scanned for URLs; ``attrs.url`` and ``attrs.href`` strings are
    added as URLs directly. Within a dict, ``text`` and ``content`` are
    visited first, then the remaining keys (except ``attrs``) in dict order.

    Returns:
        The non-empty text pieces joined by newlines, and the URLs with
        duplicates removed in order of first appearance.
    """
    fragments: list[str] = []
    links: list[str] = []
    handled_keys = ("attrs", "text", "content")

    def walk(element: Any) -> None:
        if isinstance(element, str):
            fragments.append(element)
            links.extend(extract_urls_from_text(element))
        elif isinstance(element, list):
            for child in element:
                walk(child)
        elif isinstance(element, dict):
            attributes = element.get("attrs")
            if isinstance(attributes, dict):
                links.extend(
                    candidate
                    for candidate in (attributes.get("url"), attributes.get("href"))
                    if isinstance(candidate, str)
                )
            for field in ("text", "content"):
                if field in element:
                    walk(element[field])
            for field, child in element.items():
                if field not in handled_keys and isinstance(child, (dict, list, str)):
                    walk(child)
        # None and any other type contribute nothing.

    walk(value)
    return "\n".join(filter(None, fragments)), dedupe_preserve_order(links)
345
+
346
+
347
def looks_like_file_asset(url: str) -> bool:
    """Return True if the path of *url* ends with a known file extension.

    Only the path component is checked, case-insensitively, so query strings
    and fragments do not affect the result.
    """
    images = (".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg", ".bmp", ".ico")
    video = (".mp4", ".webm", ".mov", ".mkv", ".avi")
    audio = (".mp3", ".wav", ".aac", ".ogg")
    documents = (".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx")
    archives = (".zip", ".rar", ".7z", ".tar", ".gz")
    text_like = (".json", ".xml", ".txt", ".csv", ".md")

    known_suffixes = images + video + audio + documents + archives + text_like
    lowered_path = urlsplit(url).path.lower()
    return lowered_path.endswith(known_suffixes)
386
+
387
+
388
def json_dumps(data: dict[str, Any]) -> str:
    """Serialize *data* to JSON, keeping non-ASCII characters as they are.

    Values the JSON encoder cannot handle are converted with ``str()``.
    """
    serialized: str = json.dumps(data, default=str, ensure_ascii=False)
    return serialized
@@ -0,0 +1,3 @@
1
+ from .source import AzureBlobStorageSource
2
+
3
+ __all__ = ["AzureBlobStorageSource"]