port-ocean 0.28.11__py3-none-any.whl → 0.28.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of port-ocean might be problematic. Click here for more details.

@@ -0,0 +1,137 @@
1
+ import re
2
+ from enum import Enum
3
+
4
+ # This file is used to classify the input that a jq expression runs on.
5
+ # It is used to determine if the jq expression can be executed without providing any JSON input (const expressions)
6
+ # or on a single item (in an items-to-parse situation)
7
+ # or on all the data
8
+
9
+
10
+ class InputClassifyingResult(Enum):
11
+ NONE = 1
12
+ SINGLE = 2
13
+ ALL = 3
14
+
15
+
16
+ # Functions/filters that (even without ".") still require/assume input
17
+ _INPUT_DEPENDENT_FUNCS = r"""
18
+ \b(
19
+ map|select|reverse|sort|sort_by|unique|unique_by|group_by|flatten|transpose|
20
+ split|explode|join|add|length|has|in|index|indices|contains|
21
+ paths|leaf_paths|keys|keys_unsorted|values|to_entries|with_entries|from_entries|
22
+ del|delpaths|walk|reduce|foreach|input|inputs|limit|first|last|nth|
23
+ while|until|recurse|recurse_down|bsearch|combinations|permutations
24
+ )\b
25
+ """
26
+
27
+ _INPUT_DEPENDENT_RE = re.compile(_INPUT_DEPENDENT_FUNCS, re.VERBOSE)
28
+
29
+
30
+ # String literal handling (jq uses double quotes for strings)
31
+ _STRING_LITERAL_RE = re.compile(r'"(?:\\.|[^"\\])*"')
32
+ _STRING_ONLY_RE = re.compile(r'^\s*"(?:\\.|[^"\\])*"\s*$')
33
+ _NUMBER_ONLY_RE = re.compile(r"^\s*-?\d+(\.\d+)?\s*$")
34
+
35
+
36
+ def _mask_strings(expr: str) -> str:
37
+ """
38
+ Replace string literals with 'S' so '.' characters inside quotes don't count.
39
+ Example:
40
+ - '"this is a string"' ---> 'S'
41
+ - '"string" + .field' ---> 'S + .field'
42
+ """
43
+ return _STRING_LITERAL_RE.sub("S", expr)
44
+
45
+
46
+ def _mask_numbers(expr: str) -> str:
47
+ """
48
+ Replace number literals with 'N' so decimal points in numbers don't count as input references.
49
+ Example:
50
+ - '3.14' ---> 'N'
51
+ - '42 + 3.14' ---> 'N + N'
52
+ """
53
+ # Pattern to match numbers (integers and decimals, with optional sign)
54
+ number_pattern = re.compile(r"[-+]?\d+(?:\.\d+)?")
55
+ return number_pattern.sub("N", expr)
56
+
57
+
58
+ def can_expression_run_with_no_input(selector_query: str) -> bool:
59
+ """
60
+ Returns True if the jq expression can be executed without providing any JSON input.
61
+ Rules:
62
+ - Whitespace-only => No Input Required
63
+ - A pure string literal => No Input Required (even if it contains '.')
64
+ - After masking strings, if it contains '.' => Input Required
65
+ - Disallow known input-dependent functions (functions that require input)
66
+ - After masking strings, if it contains only operators and numbers and 'S' => No Input Required
67
+ - Allow null/true/false/number/range/empty, and array/object literals that
68
+ don't reference input (no '.' after masking strings) => No Input Required
69
+ Example:
70
+ - blueprint: '"newRelicService"' in mapping, selector_query param would be '"newRelicService"' => No Input Required
71
+ """
72
+ s = selector_query.strip()
73
+ if s == "":
74
+ return True # whitespace-only
75
+
76
+ # Pure string literal is nullary
77
+ if _STRING_ONLY_RE.match(s):
78
+ return True
79
+
80
+ # First mask strings, then mask numbers to prevent decimal points in numbers from being treated as input references
81
+ masked = _mask_strings(s).strip()
82
+ masked = _mask_numbers(masked).strip()
83
+
84
+ # If it contains any known input-dependent functions, don't shortcut
85
+ if _INPUT_DEPENDENT_RE.search(masked):
86
+ return False
87
+
88
+ # If it contains only operators and 'S'/'N', it can be executed with no input
89
+ # Example:
90
+ # - '"abc" + "def"' ---> 'S + S' => No Input Required
91
+ # - '3.14 + 2.5' ---> 'N + N' => No Input Required
92
+ # if re.fullmatch(
93
+ # r"(?:S|N)(?:\s*[+\-*/]\s*(?:S|N))*",
94
+ # masked,
95
+ # ):
96
+ # return True
97
+
98
+ if "." not in masked:
99
+ return True
100
+
101
+ return False
102
+
103
+
104
+ def _can_expression_run_on_single_item(expr: str, key: str) -> bool:
105
+ """
106
+ Detect `.key` outside of quotes, as a standalone path segment beginning
107
+ after a non-word boundary (start, space, |, (, [, {, comma, or :) and not part
108
+ of `.something.key`.
109
+ assuming key = 'item'
110
+ Examples:
111
+ - .item.yaeli => true
112
+ - map(.item.yaeli) => true
113
+ - .body.item => false
114
+ """
115
+ if not key:
116
+ return False
117
+
118
+ masked = _mask_strings(expr)
119
+ masked = _mask_numbers(masked)
120
+ pattern = re.compile(rf"(?<![A-Za-z0-9_])\.{re.escape(key)}(?![A-Za-z0-9_])")
121
+ return bool(pattern.search(masked))
122
+
123
+
124
+ def classify_input(
125
+ selector_query: str, single_item_key: str | None = None
126
+ ) -> InputClassifyingResult:
127
+ """
128
+ Returns the input evaluation result for the jq expression.
129
+ Conservative: requires NO '.' and must match a known nullary-safe pattern.
130
+ """
131
+ if can_expression_run_with_no_input(selector_query):
132
+ return InputClassifyingResult.NONE
133
+ if single_item_key and _can_expression_run_on_single_item(
134
+ selector_query, single_item_key
135
+ ):
136
+ return InputClassifyingResult.SINGLE
137
+ return InputClassifyingResult.ALL
@@ -41,7 +41,7 @@ class MappingsConfig(BaseModel):
41
41
  class PortResourceConfig(BaseModel):
42
42
  entity: MappingsConfig
43
43
  items_to_parse: str | None = Field(alias="itemsToParse")
44
- items_to_parse_name: str | None = Field(alias="itemsToParseName", default="item")
44
+ items_to_parse_name: str = Field(alias="itemsToParseName", default="item")
45
45
 
46
46
 
47
47
  class Selector(BaseModel):
@@ -117,7 +117,7 @@ class SyncRawMixin(HandlerMixin, EventsMixin):
117
117
  logger.info(
118
118
  f"Found async generator function for {resource_config.kind} name: {task.__qualname__}"
119
119
  )
120
- results.append(resync_generator_wrapper(task, resource_config.kind,resource_config.port.items_to_parse))
120
+ results.append(resync_generator_wrapper(task, resource_config.kind, resource_config.port.items_to_parse_name, resource_config.port.items_to_parse))
121
121
  else:
122
122
  logger.info(
123
123
  f"Found sync function for {resource_config.kind} name: {task.__qualname__}"
@@ -2,12 +2,11 @@ from contextlib import contextmanager
2
2
  from typing import Awaitable, Generator, Callable, cast
3
3
 
4
4
  from loguru import logger
5
-
6
5
  import asyncio
7
6
  import multiprocessing
8
-
7
+ import re
8
+ import json
9
9
  from port_ocean.core.handlers.entity_processor.jq_entity_processor import JQEntityProcessor
10
- from port_ocean.core.handlers.port_app_config.models import ResourceConfig
11
10
  from port_ocean.core.ocean_types import (
12
11
  ASYNC_GENERATOR_RESYNC_TYPE,
13
12
  RAW_RESULT,
@@ -20,11 +19,66 @@ from port_ocean.exceptions.core import (
20
19
  OceanAbortException,
21
20
  KindNotImplementedException,
22
21
  )
23
-
22
+ import os
24
23
  from port_ocean.utils.async_http import _http_client
25
24
  from port_ocean.clients.port.utils import _http_client as _port_http_client
26
25
  from port_ocean.helpers.metric.metric import MetricType, MetricPhase
27
26
  from port_ocean.context.ocean import ocean
27
+ import subprocess
28
+ import tempfile
29
+ import stat
30
+ import ijson
31
+ from typing import Any, AsyncGenerator
32
+
33
+ def _process_path_type_items(
34
+ result: RAW_RESULT, items_to_parse: str | None = None
35
+ ) -> RAW_RESULT:
36
+ """
37
+ Process items in the result array to check for "__type": "path" fields.
38
+ If found, read the file contents and load them into a "content" field.
39
+ Skip processing if we're on the items_to_parse branch.
40
+ """
41
+ if not isinstance(result, list):
42
+ return result
43
+
44
+ # Skip processing if we're on the items_to_parse branch
45
+ if items_to_parse:
46
+ return result
47
+
48
+ processed_result = []
49
+ for item in result:
50
+ if isinstance(item, dict) and item.get("__type") == "path":
51
+ try:
52
+ # Read the file content and parse as JSON
53
+ file_path = item.get("file", {}).get("content", {}).get("path")
54
+ if file_path and os.path.exists(file_path):
55
+ with open(file_path, "r") as f:
56
+ content = json.loads(f.read())
57
+ # Create a copy of the item with the content field
58
+ processed_item = item.copy()
59
+ processed_item["content"] = content
60
+ processed_result.append(processed_item)
61
+ else:
62
+ # If file doesn't exist, keep the original item
63
+ processed_result.append(item)
64
+ except (json.JSONDecodeError, IOError, OSError) as e:
65
+ if isinstance(item, dict) and item.get("file") is not None:
66
+ content = item["file"].get("content") if isinstance(item["file"].get("content"), dict) else {}
67
+ data_path = content.get("path", None)
68
+ logger.warning(
69
+ f"Failed to read or parse file content for path {data_path}: {e}"
70
+ )
71
+ else:
72
+ logger.warning(
73
+ f"Failed to read or parse file content for unknown path: {e}. item: {json.dumps(item)}"
74
+ )
75
+ # Keep the original item if there's an error
76
+ processed_result.append(item)
77
+ else:
78
+ # Keep non-path type items as is
79
+ processed_result.append(item)
80
+
81
+ return processed_result
28
82
 
29
83
  @contextmanager
30
84
  def resync_error_handling() -> Generator[None, None, None]:
@@ -47,11 +101,12 @@ async def resync_function_wrapper(
47
101
  ) -> RAW_RESULT:
48
102
  with resync_error_handling():
49
103
  results = await fn(kind)
50
- return validate_result(results)
104
+ validated_results = validate_result(results)
105
+ return _process_path_type_items(validated_results)
51
106
 
52
107
 
53
108
  async def resync_generator_wrapper(
54
- fn: Callable[[str], ASYNC_GENERATOR_RESYNC_TYPE], kind: str, items_to_parse: str | None = None
109
+ fn: Callable[[str], ASYNC_GENERATOR_RESYNC_TYPE], kind: str, items_to_parse_name: str, items_to_parse: str | None = None
55
110
  ) -> ASYNC_GENERATOR_RESYNC_TYPE:
56
111
  generator = fn(kind)
57
112
  errors = []
@@ -61,27 +116,23 @@ async def resync_generator_wrapper(
61
116
  with resync_error_handling():
62
117
  result = await anext(generator)
63
118
  if not ocean.config.yield_items_to_parse:
64
- yield validate_result(result)
119
+ validated_result = validate_result(result)
120
+ processed_result = _process_path_type_items(validated_result)
121
+ yield processed_result
65
122
  else:
66
- batch_size = ocean.config.yield_items_to_parse_batch_size
67
123
  if items_to_parse:
68
124
  for data in result:
69
- items = await cast(JQEntityProcessor, ocean.app.integration.entity_processor)._search(data, items_to_parse)
70
- if not isinstance(items, list):
71
- logger.warning(
72
- f"Failed to parse items for JQ expression {items_to_parse}, Expected list but got {type(items)}."
73
- f" Skipping..."
74
- )
75
- yield []
76
- raw_data = [{"item": item, **data} for item in items]
77
- while True:
78
- raw_data_batch = raw_data[:batch_size]
79
- yield raw_data_batch
80
- raw_data = raw_data[batch_size:]
81
- if len(raw_data) == 0:
82
- break
125
+ data_path: str | None = None
126
+ if isinstance(data, dict) and data.get("file") is not None:
127
+ content = data["file"].get("content") if isinstance(data["file"].get("content"), dict) else {}
128
+ data_path = content.get("path", None)
129
+ bulks = get_items_to_parse_bulks(data, data_path, items_to_parse, items_to_parse_name, data.get("__base_jq", ".file.content"))
130
+ async for bulk in bulks:
131
+ yield bulk
83
132
  else:
84
- yield validate_result(result)
133
+ validated_result = validate_result(result)
134
+ processed_result = _process_path_type_items(validated_result, items_to_parse)
135
+ yield processed_result
85
136
  except OceanAbortException as error:
86
137
  errors.append(error)
87
138
  ocean.metrics.inc_metric(
@@ -101,6 +152,104 @@ def is_resource_supported(
101
152
  ) -> bool:
102
153
  return bool(resync_event_mapping[kind] or resync_event_mapping[None])
103
154
 
155
+ def _validate_jq_expression(expression: str) -> None:
156
+ """Validate jq expression to prevent command injection."""
157
+ try:
158
+ _ = cast(JQEntityProcessor, ocean.app.integration.entity_processor)._compile(expression)
159
+ except Exception as e:
160
+ raise ValueError(f"Invalid jq expression: {e}") from e
161
+ # Basic validation - reject expressions that could be dangerous
162
+ # Check for dangerous patterns (include, import, module)
163
+ dangerous_patterns = ['include', 'import', 'module', 'env', 'debug']
164
+ for pattern in dangerous_patterns:
165
+ # Use word boundary regex to match only complete words, not substrings
166
+ if re.search(rf'\b{re.escape(pattern)}\b', expression):
167
+ raise ValueError(f"Potentially dangerous pattern '{pattern}' found in jq expression")
168
+
169
+ # Special handling for 'env' - block environment variable access
170
+ if re.search(r'(?<!\w)\$ENV(?:\.)?', expression):
171
+ raise ValueError("Environment variable access '$ENV.' found in jq expression")
172
+ if re.search(r'\benv\.', expression):
173
+ raise ValueError("Environment variable access 'env.' found in jq expression")
174
+
175
+ def _create_secure_temp_file(suffix: str = ".json") -> str:
176
+ """Create a secure temporary file with restricted permissions."""
177
+ # Create temp directory if it doesn't exist
178
+ temp_dir = "/tmp/ocean"
179
+ os.makedirs(temp_dir, exist_ok=True)
180
+
181
+ # Create temporary file with secure permissions
182
+ fd, temp_path = tempfile.mkstemp(suffix=suffix, dir=temp_dir)
183
+ try:
184
+ # Set restrictive permissions (owner read/write only)
185
+ os.chmod(temp_path, stat.S_IRUSR | stat.S_IWUSR)
186
+ return temp_path
187
+ finally:
188
+ os.close(fd)
189
+
190
+ async def get_items_to_parse_bulks(raw_data: dict[Any, Any], data_path: str, items_to_parse: str, items_to_parse_name: str, base_jq: str) -> AsyncGenerator[list[dict[str, Any]], None]:
191
+ # Validate inputs to prevent command injection
192
+ _validate_jq_expression(items_to_parse)
193
+ items_to_parse = items_to_parse.replace(base_jq, ".") if data_path else items_to_parse
194
+
195
+ temp_data_path = None
196
+ temp_output_path = None
197
+
198
+ try:
199
+ # Create secure temporary files
200
+ if not data_path:
201
+ raw_data_serialized = json.dumps(raw_data)
202
+ temp_data_path = _create_secure_temp_file("_input.json")
203
+ with open(temp_data_path, "w") as f:
204
+ f.write(raw_data_serialized)
205
+ data_path = temp_data_path
206
+
207
+ temp_output_path = _create_secure_temp_file("_parsed.json")
208
+
209
+ delete_target = items_to_parse.split('|', 1)[0].strip() if not items_to_parse.startswith('map(') else '.'
210
+ base_jq_object_string = await _build_base_jq_object_string(raw_data, base_jq, delete_target)
211
+
212
+ # Build jq expression safely
213
+ jq_expression = f""". as $all
214
+ | ($all | {items_to_parse}) as $items
215
+ | $items
216
+ | map({{{items_to_parse_name}: ., {base_jq_object_string}}})"""
217
+
218
+ # Use subprocess with list arguments instead of shell=True
219
+ jq_args = ["/bin/jq", jq_expression, data_path]
220
+
221
+ with open(temp_output_path, "w") as output_file:
222
+ result = subprocess.run(
223
+ jq_args,
224
+ stdout=output_file,
225
+ stderr=subprocess.PIPE,
226
+ text=True,
227
+ check=False # Don't raise exception, handle errors manually
228
+ )
229
+
230
+ if result.returncode != 0:
231
+ logger.error(f"Failed to parse items for JQ expression {items_to_parse}, error: {result.stderr}")
232
+ yield []
233
+ else:
234
+ with open(temp_output_path, "r") as f:
235
+ events_stream = get_events_as_a_stream(f, 'item', ocean.config.yield_items_to_parse_batch_size)
236
+ for items_bulk in events_stream:
237
+ yield items_bulk
238
+
239
+ except ValueError as e:
240
+ logger.error(f"Invalid jq expression: {e}")
241
+ yield []
242
+ except Exception as e:
243
+ logger.error(f"Failed to parse items for JQ expression {items_to_parse}, error: {e}")
244
+ yield []
245
+ finally:
246
+ # Cleanup temporary files
247
+ for temp_path in [temp_data_path, temp_output_path]:
248
+ if temp_path and os.path.exists(temp_path):
249
+ try:
250
+ os.remove(temp_path)
251
+ except OSError as e:
252
+ logger.warning(f"Failed to cleanup temporary file {temp_path}: {e}")
104
253
 
105
254
  def unsupported_kind_response(
106
255
  kind: str, available_resync_kinds: list[str]
@@ -108,6 +257,44 @@ def unsupported_kind_response(
108
257
  logger.error(f"Kind {kind} is not supported in this integration")
109
258
  return [], [KindNotImplementedException(kind, available_resync_kinds)]
110
259
 
260
+ async def _build_base_jq_object_string(raw_data: dict[Any, Any], base_jq: str, delete_target: str) -> str:
261
+ base_jq_object_before_parsing = await cast(JQEntityProcessor, ocean.app.integration.entity_processor)._search(raw_data, f"{base_jq} = {json.dumps("__all")}")
262
+ base_jq_object_before_parsing_serialized = json.dumps(base_jq_object_before_parsing)
263
+ base_jq_object_before_parsing_serialized = base_jq_object_before_parsing_serialized[1:-1] if len(base_jq_object_before_parsing_serialized) >= 2 else base_jq_object_before_parsing_serialized
264
+ base_jq_object_before_parsing_serialized = base_jq_object_before_parsing_serialized.replace("\"__all\"", f"(($all | del({delete_target})) // {{}})")
265
+ return base_jq_object_before_parsing_serialized
266
+
267
+
268
+ def get_events_as_a_stream(
269
+ stream: Any,
270
+ target_items: str = "item",
271
+ max_buffer_size_mb: int = 1
272
+ ) -> Generator[list[dict[str, Any]], None, None]:
273
+ events = ijson.sendable_list()
274
+ coro = ijson.items_coro(events, target_items)
275
+
276
+ # Convert MB to bytes for the buffer size
277
+ buffer_size = max_buffer_size_mb * 1024 * 1024
278
+
279
+ # Read chunks from the stream until exhausted
280
+ while True:
281
+ chunk = stream.read(buffer_size)
282
+ if not chunk: # End of stream
283
+ break
284
+
285
+ # Convert string to bytes if necessary (for text mode files)
286
+ if isinstance(chunk, str):
287
+ chunk = chunk.encode('utf-8')
288
+
289
+ coro.send(chunk)
290
+ yield events
291
+ del events[:]
292
+ try:
293
+ coro.close()
294
+ finally:
295
+ if events:
296
+ yield events
297
+ events[:] = []
111
298
 
112
299
  class ProcessWrapper(multiprocessing.Process):
113
300
  def __init__(self, *args, **kwargs):
@@ -134,3 +321,34 @@ def clear_http_client_context() -> None:
134
321
  _port_http_client.pop()
135
322
  except (RuntimeError, AttributeError):
136
323
  pass
324
+
325
+ class _AiterReader:
326
+ """
327
+ Wraps an iterable of byte chunks (e.g., response.iter_bytes())
328
+ and exposes a .read(n) method that ijson expects.
329
+ """
330
+ def __init__(self, iterable):
331
+ self._iter = iter(iterable)
332
+ self._buf = bytearray()
333
+ self._eof = False
334
+
335
+ def read(self, n=-1):
336
+ # If n < 0, return everything until EOF
337
+ if n is None or n < 0:
338
+ chunks = [bytes(self._buf)]
339
+ self._buf.clear()
340
+ chunks.extend(self._iter) # drain the iterator
341
+ return b"".join(chunks)
342
+
343
+ # Fill buffer until we have n bytes or hit EOF
344
+ while len(self._buf) < n and not self._eof:
345
+ try:
346
+ self._buf.extend(next(self._iter))
347
+ except StopIteration:
348
+ self._eof = True
349
+ break
350
+
351
+ # Serve up to n bytes
352
+ out = bytes(self._buf[:n])
353
+ del self._buf[:n]
354
+ return out