port-ocean 0.30.7__py3-none-any.whl → 0.31.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- integrations/_infra/Dockerfile.Deb +1 -1
- integrations/_infra/Dockerfile.local +2 -2
- integrations/_infra/entry_local.sh +2 -1
- integrations/_infra/hosts +4 -0
- port_ocean/config/settings.py +1 -2
- port_ocean/core/handlers/entity_processor/jq_entity_processor.py +14 -480
- port_ocean/core/integrations/mixins/utils.py +72 -302
- port_ocean/tests/core/handlers/entity_processor/test_jq_entity_processor.py +2 -933
- port_ocean/tests/core/integrations/mixins/test_integration_utils.py +0 -312
- {port_ocean-0.30.7.dist-info → port_ocean-0.31.0.dist-info}/METADATA +1 -1
- {port_ocean-0.30.7.dist-info → port_ocean-0.31.0.dist-info}/RECORD +14 -13
- {port_ocean-0.30.7.dist-info → port_ocean-0.31.0.dist-info}/LICENSE.md +0 -0
- {port_ocean-0.30.7.dist-info → port_ocean-0.31.0.dist-info}/WHEEL +0 -0
- {port_ocean-0.30.7.dist-info → port_ocean-0.31.0.dist-info}/entry_points.txt +0 -0
port_ocean/core/integrations/mixins/utils.py
@@ -1,22 +1,14 @@
 import asyncio
-import json
 import multiprocessing
-import os
 import re
-import shutil
-import stat
-import subprocess
-import tempfile
 from contextlib import contextmanager
-from typing import Any, AsyncGenerator, Awaitable, Callable, Generator
-import copy
+from typing import Any, AsyncGenerator, Awaitable, Callable, Generator

 import ijson
 from loguru import logger

 from port_ocean.clients.port.utils import _http_client as _port_http_client
 from port_ocean.context.ocean import ocean
-from port_ocean.core.handlers.entity_processor.jq_entity_processor import JQEntityProcessor
 from port_ocean.core.ocean_types import (
     ASYNC_GENERATOR_RESYNC_TYPE,
     RAW_RESULT,
@@ -32,122 +24,6 @@ from port_ocean.exceptions.core import (
 from port_ocean.helpers.metric.metric import MetricType, MetricPhase
 from port_ocean.utils.async_http import _http_client

-def _process_path_type_items(
-    result: RAW_RESULT, items_to_parse: str | None = None
-) -> RAW_RESULT:
-    """
-    Process items in the result array to check for "__type": "path" fields.
-    If found, read the file contents and load them into a "content" field.
-    Skip processing if we're on the items_to_parse branch.
-    """
-    if not isinstance(result, list):
-        return result
-
-    # Skip processing if we're on the items_to_parse branch
-    if items_to_parse:
-        return result
-
-    processed_result = []
-    for item in result:
-        if isinstance(item, dict) and item.get("__type") == "path":
-            try:
-                # Read the file content and parse as JSON
-                file_path = item.get("file", {}).get("content", {}).get("path")
-                if file_path and os.path.exists(file_path):
-                    with open(file_path, "r") as f:
-                        content = json.loads(f.read())
-                    # Create a copy of the item with the content field
-                    processed_item = item.copy()
-                    processed_item["file"]["content"] = content
-                    processed_result.append(processed_item)
-                else:
-                    # If file doesn't exist, keep the original item
-                    processed_result.append(item)
-            except (json.JSONDecodeError, IOError, OSError) as e:
-                if isinstance(item, dict) and item.get("file") is not None:
-                    content = item["file"].get("content") if isinstance(item["file"].get("content"), dict) else {}
-                    data_path = content.get("path", None)
-                    logger.warning(
-                        f"Failed to read or parse file content for path {data_path}: {e}"
-                    )
-                else:
-                    logger.warning(
-                        f"Failed to read or parse file content for unknown path: {e}. item: {json.dumps(item)}"
-                    )
-                # Keep the original item if there's an error
-                processed_result.append(item)
-        else:
-            # Keep non-path type items as is
-            processed_result.append(item)
-
-    return processed_result
-
-@contextmanager
-def resync_error_handling() -> Generator[None, None, None]:
-    try:
-        yield
-    except RawObjectValidationException as error:
-        err_msg = f"Failed to validate raw data for returned data from resync function, error: {error}"
-        logger.exception(err_msg)
-        raise OceanAbortException(err_msg) from error
-    except StopAsyncIteration:
-        raise
-    except Exception as error:
-        err_msg = f"Failed to execute resync function, error: {error}"
-        logger.exception(err_msg)
-        raise OceanAbortException(err_msg) from error
-
-
-async def resync_function_wrapper(
-    fn: Callable[[str], Awaitable[RAW_RESULT]], kind: str, items_to_parse: str | None = None
-) -> RAW_RESULT:
-    with resync_error_handling():
-        results = await fn(kind)
-        validated_results = validate_result(results)
-        return _process_path_type_items(validated_results, items_to_parse)
-
-
-async def resync_generator_wrapper(
-    fn: Callable[[str], ASYNC_GENERATOR_RESYNC_TYPE], kind: str, items_to_parse_name: str, items_to_parse: str | None = None
-) -> ASYNC_GENERATOR_RESYNC_TYPE:
-    generator = fn(kind)
-    errors = []
-    try:
-        while True:
-            try:
-                with resync_error_handling():
-                    result = await anext(generator)
-                    if not ocean.config.yield_items_to_parse:
-                        validated_result = validate_result(result)
-                        processed_result = _process_path_type_items(validated_result,items_to_parse)
-                        yield processed_result
-                    else:
-                        if items_to_parse:
-                            for data in result:
-                                data_path: str | None = None
-                                if isinstance(data, dict) and data.get("__type") == "path":
-                                    content = data.get("file", {}).get("content") if isinstance(data["file"].get("content"), dict) else {}
-                                    data_path = content.get("path", None)
-                                bulks = get_items_to_parse_bulks(data, data_path, items_to_parse, items_to_parse_name, data.get("__base_jq", ".file.content"))
-                                async for bulk in bulks:
-                                    yield bulk
-                        else:
-                            validated_result = validate_result(result)
-                            processed_result = _process_path_type_items(validated_result, items_to_parse)
-                            yield processed_result
-            except OceanAbortException as error:
-                errors.append(error)
-                ocean.metrics.inc_metric(
-                    name=MetricType.OBJECT_COUNT_NAME,
-                    labels=[ocean.metrics.current_resource_kind(), MetricPhase.EXTRACT , MetricPhase.ExtractResult.FAILED],
-                    value=1
-                )
-    except StopAsyncIteration:
-        if errors:
-            raise ExceptionGroup(
-                "At least one of the resync generator iterations failed", errors
-            )
-
 def extract_jq_deletion_path_revised(jq_expression: str) -> str | None:
     """
     Revised function to extract a simple path suitable for del() by analyzing pipe segments.
@@ -204,113 +80,90 @@ def extract_jq_deletion_path_revised(jq_expression: str) -> str | None:
     # Default case: No suitable path found after checking all segments
     return None

-
-def is_resource_supported(
-    kind: str, resync_event_mapping: dict[str | None, list[RESYNC_EVENT_LISTENER]]
-) -> bool:
-    return bool(resync_event_mapping[kind] or resync_event_mapping[None])
-
-def _validate_jq_expression(expression: str) -> None:
-    """Validate jq expression to prevent command injection."""
+@contextmanager
+def resync_error_handling() -> Generator[None, None, None]:
     try:
-
-    except
-
-
-
-
-
-
-
-
-
-    # Special handling for 'env' - block environment variable access
-    if re.search(r'(?<!\w)\$ENV(?:\.)?', expression):
-        raise ValueError("Environment variable access '$ENV.' found in jq expression")
-    if re.search(r'\benv\.', expression):
-        raise ValueError("Environment variable access 'env.' found in jq expression")
+        yield
+    except RawObjectValidationException as error:
+        err_msg = f"Failed to validate raw data for returned data from resync function, error: {error}"
+        logger.exception(err_msg)
+        raise OceanAbortException(err_msg) from error
+    except StopAsyncIteration:
+        raise
+    except Exception as error:
+        err_msg = f"Failed to execute resync function, error: {error}"
+        logger.exception(err_msg)
+        raise OceanAbortException(err_msg) from error

-def _create_secure_temp_file(suffix: str = ".json") -> str:
-    """Create a secure temporary file with restricted permissions."""
-    # Create temp directory if it doesn't exist
-    temp_dir = "/tmp/ocean"
-    os.makedirs(temp_dir, exist_ok=True)

-
-
-
-
-
-        return
-    finally:
-        os.close(fd)
+async def resync_function_wrapper(
+    fn: Callable[[str], Awaitable[RAW_RESULT]], kind: str, items_to_parse: str | None = None
+) -> RAW_RESULT:
+    with resync_error_handling():
+        results = await fn(kind)
+        return validate_result(results)

-async def
-
-
-
+async def handle_items_to_parse(result: RAW_RESULT, items_to_parse_name: str, items_to_parse: str | None = None) -> AsyncGenerator[list[dict[str, Any]], None]:
+    delete_target = extract_jq_deletion_path_revised(items_to_parse) or '.'
+    jq_expression = f". | del({delete_target})"
+    batch_size = ocean.config.yield_items_to_parse_batch_size

-
-
+    for item in result:
+        lean_item = await ocean.app.integration.entity_processor._search(item, jq_expression)
+        items_to_parse_data = await ocean.app.integration.entity_processor._search(item, items_to_parse)
+        if not isinstance(items_to_parse_data, list):
+            logger.warning(
+                f"Failed to parse items for JQ expression {items_to_parse}, Expected list but got {type(items_to_parse_data)}."
+                f" Skipping..."
+            )
+            continue
+        batch = []
+        while len(items_to_parse_data) > 0:
+            if (len(batch) >= batch_size):
+                yield batch
+                batch = []
+            merged_item = {**lean_item}
+            merged_item[items_to_parse_name] = items_to_parse_data.pop(0)
+            batch.append(merged_item)
+        if len(batch) > 0:
+            yield batch

+async def resync_generator_wrapper(
+    fn: Callable[[str], ASYNC_GENERATOR_RESYNC_TYPE], kind: str, items_to_parse_name: str, items_to_parse: str | None = None
+) -> ASYNC_GENERATOR_RESYNC_TYPE:
+    generator = fn(kind)
+    errors = []
     try:
-
-
-
-
-        temp_data_path = _create_secure_temp_file("_input.json")
-        with open(temp_data_path, "w") as f:
-            f.write(raw_data_serialized)
-        data_path = temp_data_path
-        is_path_type = False
-
-        temp_output_path = _create_secure_temp_file("_parsed.json")
-
-        delete_target = extract_jq_deletion_path_revised(items_to_parse) or '.'
-
-        # Build jq expression safely
-        jq_expression = f""". as $all
-            | ($all | {items_to_parse}) as $items
-            | $items
-            | {_build_mapping_jq_expression(items_to_parse_name, base_jq, delete_target, is_path_type)}"""
+        while True:
+            try:
+                with resync_error_handling():
+                    result = validate_result(await anext(generator))

-
+                if items_to_parse:
+                    async for batch in handle_items_to_parse(result, items_to_parse_name, items_to_parse):
+                        yield batch
+                else:
+                    yield result

-        jq_path = shutil.which("jq") or "/bin/jq"
-        jq_args = [jq_path, jq_expression, data_path]

-
-
-
-
-
-
-
+            except OceanAbortException as error:
+                errors.append(error)
+                ocean.metrics.inc_metric(
+                    name=MetricType.OBJECT_COUNT_NAME,
+                    labels=[ocean.metrics.current_resource_kind(), MetricPhase.EXTRACT , MetricPhase.ExtractResult.FAILED],
+                    value=1
+                )
+    except StopAsyncIteration:
+        if errors:
+            raise ExceptionGroup(
+                "At least one of the resync generator iterations failed", errors
             )

-        if result.returncode != 0:
-            logger.error(f"Failed to parse items for JQ expression {items_to_parse}, error: {result.stderr}")
-            yield []
-        else:
-            with open(temp_output_path, "r") as f:
-                events_stream = get_events_as_a_stream(f, 'item', ocean.config.yield_items_to_parse_batch_size)
-                for items_bulk in events_stream:
-                    yield items_bulk if not is_path_type else [merge_raw_data_to_item(item, raw_data) for item in items_bulk]

-
-
-
-
-        logger.error(f"Failed to parse items for JQ expression {items_to_parse}, error: {e}")
-        yield []
-    finally:
-        # Cleanup temporary files
-        for temp_path in [temp_data_path, temp_output_path]:
-            if temp_path and os.path.exists(temp_path):
-                try:
-                    os.remove(temp_path)
-                except OSError as e:
-                    logger.warning(f"Failed to cleanup temporary file {temp_path}: {e}")
+def is_resource_supported(
+    kind: str, resync_event_mapping: dict[str | None, list[RESYNC_EVENT_LISTENER]]
+) -> bool:
+    return bool(resync_event_mapping[kind] or resync_event_mapping[None])

 def unsupported_kind_response(
     kind: str, available_resync_kinds: list[str]
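The new `handle_items_to_parse` replaces the old jq-subprocess and temp-file pipeline with in-process batching: for each raw item it builds a "lean" copy (the `items_to_parse` path removed via `del(...)`), evaluates the `items_to_parse` expression, and yields the parsed elements attached to that lean copy in batches of `yield_items_to_parse_batch_size`. The snippet below is a minimal standalone sketch of that batching behaviour only; `batch_parsed_items` and the sample data are hypothetical stand-ins for the JQ-based `entity_processor._search` lookups used in the real code.

```python
from typing import Any, Generator


def batch_parsed_items(
    lean_item: dict[str, Any],
    parsed_items: list[Any],
    items_to_parse_name: str,
    batch_size: int,
) -> Generator[list[dict[str, Any]], None, None]:
    """Yield parsed elements attached to a copy of the lean parent item, in batches."""
    batch: list[dict[str, Any]] = []
    for parsed in parsed_items:
        if len(batch) >= batch_size:
            yield batch
            batch = []
        # Each parsed element becomes its own record, carrying the lean parent fields
        batch.append({**lean_item, items_to_parse_name: parsed})
    if batch:
        yield batch


# Example: a repo-like parent whose parsed list is split into batches of 2
lean = {"repo": "demo"}
for chunk in batch_parsed_items(lean, [{"id": 1}, {"id": 2}, {"id": 3}], "item", 2):
    print(chunk)
# [{'repo': 'demo', 'item': {'id': 1}}, {'repo': 'demo', 'item': {'id': 2}}]
# [{'repo': 'demo', 'item': {'id': 3}}]
```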
@@ -318,89 +171,6 @@ def unsupported_kind_response(
     logger.error(f"Kind {kind} is not supported in this integration")
     return [], [KindNotImplementedException(kind, available_resync_kinds)]

-def _build_mapping_jq_expression(items_to_parse_name: str, base_jq: str, delete_target: str, is_path_type: bool = False) -> str:
-    if is_path_type:
-        return f"map({{{items_to_parse_name}: . }} | {base_jq} = (($all | del({delete_target})) // {{}}))"
-    return f"map(($all | del({delete_target})) + {{{items_to_parse_name}: . }})"
-
-def merge_raw_data_to_item(item: dict[str, Any], raw_data: dict[str, Any]) -> dict[str, Any]:
-    return recursive_dict_merge(raw_data, item)
-
-def recursive_dict_merge(d1: dict[Any, Any], d2: dict[Any, Any]) -> dict[Any, Any]:
-    """
-    Recursively merges dict d2 into dict d1.
-
-    If a key exists in both dictionaries:
-    1. If the value in d2 is an empty dictionary (e.g., {}), it overwrites the value in d1.
-    2. If both values are non-empty dictionaries, they are merged recursively.
-    3. Otherwise, the value from d2 overwrites the value from d1.
-
-    The original dictionaries are not modified (d1 is copied).
-
-    Args:
-        d1: The base dictionary (will be copied and modified).
-        d2: The dictionary to merge into d1.
-
-    Returns:
-        The merged dictionary.
-    """
-    # Start with a copy of d1 to ensure d1 is not mutated
-    merged_dict = copy.deepcopy(d1)
-
-    for key, value in d2.items():
-        # Condition to trigger recursive deep merge:
-        # 1. Key exists in merged_dict
-        # 2. Both values are dictionaries
-        # 3. The value from d2 is NOT an empty dictionary ({}).
-        # If d2's value is {}, we treat it as an explicit instruction to overwrite/clear.
-        is_deep_merge = (
-            key in merged_dict and
-            isinstance(merged_dict[key], dict) and
-            isinstance(value, dict) and
-            value != {}
-        )
-
-        if is_deep_merge:
-            # If both are dictionaries and d2 is not empty, recurse
-            merged_dict[key] = recursive_dict_merge(merged_dict[key], value)
-        else:
-            # Otherwise (new key, non-dict value, or explicit {} overwrite),
-            # overwrite the value from d1 with the value from d2
-            merged_dict[key] = value
-
-    return merged_dict
-
-def get_events_as_a_stream(
-    stream: Any,
-    target_items: str = "item",
-    max_buffer_size_mb: int = 1
-) -> Generator[list[dict[str, Any]], None, None]:
-    events = ijson.sendable_list()
-    coro = ijson.items_coro(events, target_items, use_float=True)
-
-    # Convert MB to bytes for the buffer size
-    buffer_size = max_buffer_size_mb * 1024 * 1024
-
-    # Read chunks from the stream until exhausted
-    while True:
-        chunk = stream.read(buffer_size)
-        if not chunk: # End of stream
-            break
-
-        # Convert string to bytes if necessary (for text mode files)
-        if isinstance(chunk, str):
-            chunk = chunk.encode('utf-8')
-
-        coro.send(chunk)
-        yield events
-        del events[:]
-    try:
-        coro.close()
-    finally:
-        if events:
-            yield events
-            events[:] = []
-
 class ProcessWrapper(multiprocessing.Process):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)