port-ocean 0.28.5__py3-none-any.whl → 0.29.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. integrations/_infra/Dockerfile.Deb +1 -0
  2. integrations/_infra/Dockerfile.local +1 -0
  3. port_ocean/clients/port/authentication.py +19 -0
  4. port_ocean/clients/port/client.py +3 -0
  5. port_ocean/clients/port/mixins/actions.py +93 -0
  6. port_ocean/clients/port/mixins/blueprints.py +0 -12
  7. port_ocean/clients/port/mixins/entities.py +79 -44
  8. port_ocean/clients/port/mixins/integrations.py +7 -2
  9. port_ocean/config/settings.py +35 -3
  10. port_ocean/context/ocean.py +7 -5
  11. port_ocean/core/defaults/initialize.py +12 -5
  12. port_ocean/core/event_listener/__init__.py +7 -0
  13. port_ocean/core/event_listener/actions_only.py +42 -0
  14. port_ocean/core/event_listener/base.py +4 -1
  15. port_ocean/core/event_listener/factory.py +18 -9
  16. port_ocean/core/event_listener/http.py +4 -3
  17. port_ocean/core/event_listener/kafka.py +3 -2
  18. port_ocean/core/event_listener/once.py +5 -2
  19. port_ocean/core/event_listener/polling.py +4 -3
  20. port_ocean/core/event_listener/webhooks_only.py +3 -2
  21. port_ocean/core/handlers/actions/__init__.py +7 -0
  22. port_ocean/core/handlers/actions/abstract_executor.py +150 -0
  23. port_ocean/core/handlers/actions/execution_manager.py +434 -0
  24. port_ocean/core/handlers/entity_processor/jq_entity_processor.py +479 -17
  25. port_ocean/core/handlers/entity_processor/jq_input_evaluator.py +137 -0
  26. port_ocean/core/handlers/port_app_config/models.py +4 -2
  27. port_ocean/core/handlers/webhook/abstract_webhook_processor.py +16 -0
  28. port_ocean/core/handlers/webhook/processor_manager.py +30 -12
  29. port_ocean/core/integrations/mixins/sync_raw.py +4 -4
  30. port_ocean/core/integrations/mixins/utils.py +250 -29
  31. port_ocean/core/models.py +35 -2
  32. port_ocean/core/utils/utils.py +16 -5
  33. port_ocean/exceptions/execution_manager.py +22 -0
  34. port_ocean/helpers/retry.py +4 -40
  35. port_ocean/log/logger_setup.py +2 -2
  36. port_ocean/ocean.py +30 -4
  37. port_ocean/tests/clients/port/mixins/test_entities.py +71 -5
  38. port_ocean/tests/core/event_listener/test_kafka.py +14 -7
  39. port_ocean/tests/core/handlers/actions/test_execution_manager.py +837 -0
  40. port_ocean/tests/core/handlers/entity_processor/test_jq_entity_processor.py +932 -1
  41. port_ocean/tests/core/handlers/entity_processor/test_jq_input_evaluator.py +932 -0
  42. port_ocean/tests/core/handlers/webhook/test_processor_manager.py +3 -1
  43. port_ocean/tests/core/utils/test_get_port_diff.py +164 -0
  44. port_ocean/tests/helpers/test_retry.py +241 -1
  45. port_ocean/tests/utils/test_cache.py +240 -0
  46. port_ocean/utils/cache.py +45 -9
  47. {port_ocean-0.28.5.dist-info → port_ocean-0.29.0.dist-info}/METADATA +2 -1
  48. {port_ocean-0.28.5.dist-info → port_ocean-0.29.0.dist-info}/RECORD +51 -41
  49. {port_ocean-0.28.5.dist-info → port_ocean-0.29.0.dist-info}/LICENSE.md +0 -0
  50. {port_ocean-0.28.5.dist-info → port_ocean-0.29.0.dist-info}/WHEEL +0 -0
  51. {port_ocean-0.28.5.dist-info → port_ocean-0.29.0.dist-info}/entry_points.txt +0 -0
port_ocean/core/handlers/entity_processor/jq_input_evaluator.py (new file)
@@ -0,0 +1,137 @@
+ import re
+ from enum import Enum
+
+ # This file classifies the input that a jq expression is to run on.
+ # It is used to determine whether the jq expression can be executed without providing any JSON input (const expressions),
+ # on a single item (in an items-to-parse situation),
+ # or on all the data.
+
+
+ class InputClassifyingResult(Enum):
+     NONE = 1
+     SINGLE = 2
+     ALL = 3
+
+
+ # Functions/filters that (even without ".") still require/assume input
+ _INPUT_DEPENDENT_FUNCS = r"""
+ \b(
+ map|select|reverse|sort|sort_by|unique|unique_by|group_by|flatten|transpose|
+ split|explode|join|add|length|has|in|index|indices|contains|
+ paths|leaf_paths|keys|keys_unsorted|values|to_entries|with_entries|from_entries|
+ del|delpaths|walk|reduce|foreach|input|inputs|limit|first|last|nth|
+ while|until|recurse|recurse_down|bsearch|combinations|permutations
+ )\b
+ """
+
+ _INPUT_DEPENDENT_RE = re.compile(_INPUT_DEPENDENT_FUNCS, re.VERBOSE)
+
+
+ # String literal handling (jq uses double quotes for strings)
+ _STRING_LITERAL_RE = re.compile(r'"(?:\\.|[^"\\])*"')
+ _STRING_ONLY_RE = re.compile(r'^\s*"(?:\\.|[^"\\])*"\s*$')
+ _NUMBER_ONLY_RE = re.compile(r"^\s*-?\d+(\.\d+)?\s*$")
+
+
+ def _mask_strings(expr: str) -> str:
+     """
+     Replace string literals with 'S' so a '.' inside quotes doesn't count.
+     Example:
+         - '"this is a string"' ---> 'S'
+         - '"string" + .field'  ---> 'S + .field'
+     """
+     return _STRING_LITERAL_RE.sub("S", expr)
+
+
+ def _mask_numbers(expr: str) -> str:
+     """
+     Replace number literals with 'N' so decimal points in numbers don't count as input references.
+     Example:
+         - '3.14'      ---> 'N'
+         - '42 + 3.14' ---> 'N + N'
+     """
+     # Pattern to match numbers (integers and decimals, with optional sign)
+     number_pattern = re.compile(r"[-+]?\d+(?:\.\d+)?")
+     return number_pattern.sub("N", expr)
+
+
+ def can_expression_run_with_no_input(selector_query: str) -> bool:
+     """
+     Returns True if the jq expression can be executed without providing any JSON input.
+     Rules:
+     - Whitespace-only => no input required
+     - A pure string literal => no input required (even if it contains '.')
+     - After masking strings, if it contains '.' => input required
+     - Known input-dependent functions (functions that require input) are disallowed
+     - After masking strings, if it contains only operators, numbers, and 'S' => no input required
+     - null/true/false/number/range/empty, and array/object literals that
+       don't reference input (no '.' after masking strings), are allowed => no input required
+     Example:
+         - blueprint: '"newRelicService"' in the mapping; the selector_query param would be '"newRelicService"' => no input required
+     """
+     s = selector_query.strip()
+     if s == "":
+         return True  # whitespace-only
+
+     # A pure string literal is nullary
+     if _STRING_ONLY_RE.match(s):
+         return True
+
+     # First mask strings, then mask numbers, so decimal points in numbers aren't treated as input references
+     masked = _mask_strings(s).strip()
+     masked = _mask_numbers(masked).strip()
+
+     # If it contains any known input-dependent functions, don't shortcut
+     if _INPUT_DEPENDENT_RE.search(masked):
+         return False
+
+     # If it contains only operators and 'S'/'N', it can be executed with no input
+     # Example:
+     #     - '"abc" + "def"' ---> 'S + S' => no input required
+     #     - '3.14 + 2.5'    ---> 'N + N' => no input required
+     # if re.fullmatch(
+     #     r"(?:S|N)(?:\s*[+\-*/]\s*(?:S|N))*",
+     #     masked,
+     # ):
+     #     return True
+
+     if "." not in masked:
+         return True
+
+     return False
+
+
+ def _can_expression_run_on_single_item(expr: str, key: str) -> bool:
+     """
+     Detect `.key` outside of quotes, as a standalone path segment beginning
+     after a non-word boundary (start, space, |, (, [, {, ',' or ':') and not
+     part of `.something.key`.
+     Assuming key = 'item':
+         - .item.yaeli      => true
+         - map(.item.yaeli) => true
+         - .body.item       => false
+     """
+     if not key:
+         return False
+
+     masked = _mask_strings(expr)
+     masked = _mask_numbers(masked)
+     pattern = re.compile(rf"(?<![A-Za-z0-9_])\.{re.escape(key)}(?![A-Za-z0-9_])")
+     return bool(pattern.search(masked))
+
+
+ def classify_input(
+     selector_query: str, single_item_key: str | None = None
+ ) -> InputClassifyingResult:
+     """
+     Returns the input evaluation result for the jq expression.
+     Conservative: requires no '.' and a match against a known nullary-safe pattern.
+     """
+     if can_expression_run_with_no_input(selector_query):
+         return InputClassifyingResult.NONE
+     if single_item_key and _can_expression_run_on_single_item(
+         selector_query, single_item_key
+     ):
+         return InputClassifyingResult.SINGLE
+     return InputClassifyingResult.ALL
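
For orientation, a minimal sketch of how the new classifier behaves; these exact calls are not part of the diff, but follow directly from the code above:

from port_ocean.core.handlers.entity_processor.jq_input_evaluator import (
    InputClassifyingResult,
    classify_input,
)

# A constant expression needs no JSON input at all.
assert classify_input('"newRelicService"') == InputClassifyingResult.NONE
assert classify_input("1 + 2") == InputClassifyingResult.NONE

# An expression that only touches the parsed item can run per item.
assert classify_input(".item.name", single_item_key="item") == InputClassifyingResult.SINGLE

# Anything else needs the full payload (.item here is not a standalone segment).
assert classify_input(".body.item", single_item_key="item") == InputClassifyingResult.ALL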
port_ocean/core/handlers/port_app_config/models.py
@@ -29,7 +29,9 @@ class EntityMapping(BaseModel):
 
      @property
      def is_using_search_identifier(self) -> bool:
-         return isinstance(self.identifier, dict)
+         return isinstance(self.identifier, dict) or isinstance(
+             self.identifier, IngestSearchQuery
+         )
 
 
  class MappingsConfig(BaseModel):
@@ -39,7 +41,7 @@ class MappingsConfig(BaseModel):
  class PortResourceConfig(BaseModel):
      entity: MappingsConfig
      items_to_parse: str | None = Field(alias="itemsToParse")
-     items_to_parse_name: str | None = Field(alias="itemsToParseName", default="item")
+     items_to_parse_name: str = Field(alias="itemsToParseName", default="item")
 
 
  class Selector(BaseModel):
port_ocean/core/handlers/webhook/abstract_webhook_processor.py
@@ -1,4 +1,5 @@
  from abc import ABC, abstractmethod
+ from enum import StrEnum
  from loguru import logger
 
  from port_ocean.core.handlers.port_app_config.models import ResourceConfig
@@ -12,6 +13,17 @@ from .webhook_event import (
  )
 
 
+ class WebhookProcessorType(StrEnum):
+     """Type of webhook processor"""
+
+     # For action-related webhooks
+     # (e.g. updating a finished action using the workflow-runs webhook)
+     ACTION = "action"
+     # For regular webhooks
+     # (e.g. repository events that should be reflected as Entities in Port)
+     WEBHOOK = "webhook"
+
+
  class AbstractWebhookProcessor(ABC):
      """
      Abstract base class for webhook processors
@@ -47,6 +59,10 @@ class AbstractWebhookProcessor(ABC):
          self.event = event
          self.retry_count = 0
 
+     @classmethod
+     def get_processor_type(cls) -> WebhookProcessorType:
+         return WebhookProcessorType.WEBHOOK
+
      async def on_error(self, error: Exception) -> None:
          """Hook to handle errors during processing. Override if needed"""
          delay = self.calculate_retry_delay()
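
The new hook lets an integration opt a processor into the action flow. A hedged sketch of an override (the class name is hypothetical; only get_processor_type is shown, the remaining AbstractWebhookProcessor hooks such as should_process_event, get_matching_kinds, and handle_event would be implemented as usual):

class WorkflowRunActionProcessor(AbstractWebhookProcessor):
    """Hypothetical processor for workflow-run webhooks that complete Port actions."""

    @classmethod
    def get_processor_type(cls) -> WebhookProcessorType:
        # Marks this processor as action-related, so the processor manager
        # dispatches it without binding it to a ResourceConfig (see below).
        return WebhookProcessorType.ACTION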
port_ocean/core/handlers/webhook/processor_manager.py
@@ -11,11 +11,17 @@ from port_ocean.core.handlers.queue.abstract_queue import AbstractQueue
  from port_ocean.core.integrations.mixins.events import EventsMixin
  from port_ocean.core.integrations.mixins.live_events import LiveEventsMixin
  from port_ocean.exceptions.webhook_processor import WebhookEventNotSupportedError
- from .webhook_event import WebhookEvent, WebhookEventRawResults, LiveEventTimestamp
+ from port_ocean.core.handlers.webhook.webhook_event import (
+     WebhookEvent,
+     WebhookEventRawResults,
+     LiveEventTimestamp,
+ )
  from port_ocean.context.event import event
 
-
- from .abstract_webhook_processor import AbstractWebhookProcessor
+ from port_ocean.core.handlers.webhook.abstract_webhook_processor import (
+     AbstractWebhookProcessor,
+     WebhookProcessorType,
+ )
  from port_ocean.utils.signal import SignalHandler
  from port_ocean.core.handlers.queue import LocalQueue
 
@@ -56,7 +62,7 @@ class LiveEventsProcessorManager(LiveEventsMixin, EventsMixin):
          while True:
              event = None
              matching_processors: List[
-                 Tuple[ResourceConfig, AbstractWebhookProcessor]
+                 Tuple[ResourceConfig | None, AbstractWebhookProcessor]
              ] = []
              try:
                  event = await queue.get()
@@ -133,16 +139,22 @@ class LiveEventsProcessorManager(LiveEventsMixin, EventsMixin):
 
      async def _extract_matching_processors(
          self, webhook_event: WebhookEvent, path: str
-     ) -> list[tuple[ResourceConfig, AbstractWebhookProcessor]]:
+     ) -> list[tuple[ResourceConfig | None, AbstractWebhookProcessor]]:
          """Find and extract the matching processor for an event"""
 
-         created_processors: list[tuple[ResourceConfig, AbstractWebhookProcessor]] = []
+         created_processors: list[
+             tuple[ResourceConfig | None, AbstractWebhookProcessor]
+         ] = []
          event_processor_names = []
 
          for processor_class in self._processors_classes[path]:
              processor = processor_class(webhook_event.clone())
              if await processor.should_process_event(webhook_event):
                  event_processor_names.append(processor.__class__.__name__)
+                 if processor.get_processor_type() == WebhookProcessorType.ACTION:
+                     created_processors.append((None, processor))
+                     continue
+
                  kinds = await processor.get_matching_kinds(webhook_event)
                  for kind in kinds:
                      for resource in event.port_app_config.resources:
@@ -179,7 +191,10 @@ class LiveEventsProcessorManager(LiveEventsMixin, EventsMixin):
              event.set_timestamp(LiveEventTimestamp.FinishedProcessingWithError)
 
      async def _process_single_event(
-         self, processor: AbstractWebhookProcessor, path: str, resource: ResourceConfig
+         self,
+         processor: AbstractWebhookProcessor,
+         path: str,
+         resource: ResourceConfig | None,
      ) -> WebhookEventRawResults:
          """Process a single event with a specific processor"""
          try:
@@ -199,7 +214,7 @@ class LiveEventsProcessorManager(LiveEventsMixin, EventsMixin):
              raise
 
      async def _execute_processor(
-         self, processor: AbstractWebhookProcessor, resource: ResourceConfig
+         self, processor: AbstractWebhookProcessor, resource: ResourceConfig | None
      ) -> WebhookEventRawResults:
          """Execute a single processor within a max processing time"""
          try:
@@ -213,7 +228,7 @@ class LiveEventsProcessorManager(LiveEventsMixin, EventsMixin):
          )
 
      async def _process_webhook_request(
-         self, processor: AbstractWebhookProcessor, resource: ResourceConfig
+         self, processor: AbstractWebhookProcessor, resource: ResourceConfig | None
      ) -> WebhookEventRawResults:
          """Process a webhook request with retry logic
 
@@ -235,9 +250,10 @@ class LiveEventsProcessorManager(LiveEventsMixin, EventsMixin):
          while True:
              try:
                  webhook_event_raw_results = await processor.handle_event(
-                     payload, resource
+                     payload, resource  # type: ignore[arg-type]
                  )
-                 webhook_event_raw_results.resource = resource
+                 if resource is not None:
+                     webhook_event_raw_results.resource = resource
                  break
 
              except Exception as e:
@@ -258,7 +274,9 @@ class LiveEventsProcessorManager(LiveEventsMixin, EventsMixin):
          return webhook_event_raw_results
 
      def register_processor(
-         self, path: str, processor: Type[AbstractWebhookProcessor]
+         self,
+         path: str,
+         processor: Type[AbstractWebhookProcessor],
      ) -> None:
          """Register a webhook processor for a specific path with optional filter
port_ocean/core/integrations/mixins/sync_raw.py
@@ -24,7 +24,7 @@ from port_ocean.core.integrations.mixins.utils import (
      resync_generator_wrapper,
      resync_function_wrapper,
  )
- from port_ocean.core.models import Entity, ProcessExecutionMode
+ from port_ocean.core.models import Entity, IntegrationFeatureFlag, ProcessExecutionMode
  from port_ocean.core.ocean_types import (
      RAW_RESULT,
      RESYNC_RESULT,
@@ -117,13 +117,13 @@ class SyncRawMixin(HandlerMixin, EventsMixin):
                  logger.info(
                      f"Found async generator function for {resource_config.kind} name: {task.__qualname__}"
                  )
-                 results.append(resync_generator_wrapper(task, resource_config.kind,resource_config.port.items_to_parse))
+                 results.append(resync_generator_wrapper(task, resource_config.kind, resource_config.port.items_to_parse_name, resource_config.port.items_to_parse))
              else:
                  logger.info(
                      f"Found sync function for {resource_config.kind} name: {task.__qualname__}"
                  )
                  task = typing.cast(Callable[[str], Awaitable[RAW_RESULT]], task)
-                 tasks.append(resync_function_wrapper(task, resource_config.kind))
+                 tasks.append(resync_function_wrapper(task, resource_config.kind, resource_config.port.items_to_parse))
 
          logger.info(
              f"Found {len(tasks) + len(results)} resync tasks for {resource_config.kind}"
@@ -478,7 +478,7 @@ class SyncRawMixin(HandlerMixin, EventsMixin):
              bool: True if lakehouse data is enabled, False otherwise
          """
          flags = await ocean.port_client.get_organization_feature_flags()
-         if "LAKEHOUSE_ELIGIBLE" in flags and ocean.config.lakehouse_enabled:
+         if IntegrationFeatureFlag.LAKEHOUSE_ELIGIBLE in flags and ocean.config.lakehouse_enabled:
              return True
          return False
 
port_ocean/core/integrations/mixins/utils.py
@@ -1,13 +1,21 @@
+ import asyncio
+ import json
+ import multiprocessing
+ import os
+ import re
+ import shutil
+ import stat
+ import subprocess
+ import tempfile
  from contextlib import contextmanager
- from typing import Awaitable, Generator, Callable, cast
+ from typing import Any, AsyncGenerator, Awaitable, Callable, Generator, cast
 
+ import ijson
  from loguru import logger
 
- import asyncio
- import multiprocessing
-
+ from port_ocean.clients.port.utils import _http_client as _port_http_client
+ from port_ocean.context.ocean import ocean
  from port_ocean.core.handlers.entity_processor.jq_entity_processor import JQEntityProcessor
- from port_ocean.core.handlers.port_app_config.models import ResourceConfig
  from port_ocean.core.ocean_types import (
      ASYNC_GENERATOR_RESYNC_TYPE,
      RAW_RESULT,
@@ -20,11 +28,58 @@ from port_ocean.exceptions.core import (
      OceanAbortException,
      KindNotImplementedException,
  )
-
- from port_ocean.utils.async_http import _http_client
- from port_ocean.clients.port.utils import _http_client as _port_http_client
  from port_ocean.helpers.metric.metric import MetricType, MetricPhase
- from port_ocean.context.ocean import ocean
+ from port_ocean.utils.async_http import _http_client
+
+
+ def _process_path_type_items(
+     result: RAW_RESULT, items_to_parse: str | None = None
+ ) -> RAW_RESULT:
+     """
+     Process items in the result array to check for "__type": "path" fields.
+     If found, read the file contents and load them into a "content" field.
+     Skip processing if we're on the items_to_parse branch.
+     """
+     if not isinstance(result, list):
+         return result
+
+     # Skip processing if we're on the items_to_parse branch
+     if items_to_parse:
+         return result
+
+     processed_result = []
+     for item in result:
+         if isinstance(item, dict) and item.get("__type") == "path":
+             try:
+                 # Read the file content and parse it as JSON
+                 file_path = item.get("file", {}).get("content", {}).get("path")
+                 if file_path and os.path.exists(file_path):
+                     with open(file_path, "r") as f:
+                         content = json.loads(f.read())
+                     # Create a copy of the item with the content field
+                     processed_item = item.copy()
+                     processed_item["file"]["content"] = content
+                     processed_result.append(processed_item)
+                 else:
+                     # If the file doesn't exist, keep the original item
+                     processed_result.append(item)
+             except (json.JSONDecodeError, IOError, OSError) as e:
+                 if isinstance(item, dict) and item.get("file") is not None:
+                     content = item["file"].get("content") if isinstance(item["file"].get("content"), dict) else {}
+                     data_path = content.get("path", None)
+                     logger.warning(
+                         f"Failed to read or parse file content for path {data_path}: {e}"
+                     )
+                 else:
+                     logger.warning(
+                         f"Failed to read or parse file content for unknown path: {e}. item: {json.dumps(item)}"
+                     )
+                 # Keep the original item if there's an error
+                 processed_result.append(item)
+         else:
+             # Keep non-path-type items as is
+             processed_result.append(item)
+
+     return processed_result
 
  @contextmanager
  def resync_error_handling() -> Generator[None, None, None]:
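
For orientation, a minimal sketch of what `_process_path_type_items` does with a path-type item (the file path and payload here are hypothetical):

# Given a raw item that points at a JSON file on disk:
raw = [{"__type": "path", "file": {"content": {"path": "/tmp/ocean/repo.json"}}}]

# Assuming /tmp/ocean/repo.json contains {"name": "my-repo"}, the helper
# replaces the path pointer with the parsed file contents:
processed = _process_path_type_items(raw)
# -> [{"__type": "path", "file": {"content": {"name": "my-repo"}}}]

# When items_to_parse is set, the list is returned untouched: that branch
# streams the file through jq instead (see get_items_to_parse_bulks below).
assert _process_path_type_items(raw, ".file.content.items") == raw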
@@ -43,15 +98,16 @@ def resync_error_handling() -> Generator[None, None, None]:
 
 
  async def resync_function_wrapper(
-     fn: Callable[[str], Awaitable[RAW_RESULT]], kind: str
+     fn: Callable[[str], Awaitable[RAW_RESULT]], kind: str, items_to_parse: str | None = None
  ) -> RAW_RESULT:
      with resync_error_handling():
          results = await fn(kind)
-     return validate_result(results)
+     validated_results = validate_result(results)
+     return _process_path_type_items(validated_results, items_to_parse)
 
 
  async def resync_generator_wrapper(
-     fn: Callable[[str], ASYNC_GENERATOR_RESYNC_TYPE], kind: str, items_to_parse: str | None = None
+     fn: Callable[[str], ASYNC_GENERATOR_RESYNC_TYPE], kind: str, items_to_parse_name: str, items_to_parse: str | None = None
  ) -> ASYNC_GENERATOR_RESYNC_TYPE:
      generator = fn(kind)
      errors = []
@@ -61,27 +117,23 @@ async def resync_generator_wrapper(
          with resync_error_handling():
              result = await anext(generator)
              if not ocean.config.yield_items_to_parse:
-                 yield validate_result(result)
+                 validated_result = validate_result(result)
+                 processed_result = _process_path_type_items(validated_result, items_to_parse)
+                 yield processed_result
              else:
-                 batch_size = ocean.config.yield_items_to_parse_batch_size
                  if items_to_parse:
                      for data in result:
-                         items = await cast(JQEntityProcessor, ocean.app.integration.entity_processor)._search(data, items_to_parse)
-                         if not isinstance(items, list):
-                             logger.warning(
-                                 f"Failed to parse items for JQ expression {items_to_parse}, Expected list but got {type(items)}."
-                                 f" Skipping..."
-                             )
-                             yield []
-                         raw_data = [{"item": item, **data} for item in items]
-                         while True:
-                             raw_data_batch = raw_data[:batch_size]
-                             yield raw_data_batch
-                             raw_data = raw_data[batch_size:]
-                             if len(raw_data) == 0:
-                                 break
+                         data_path: str | None = None
+                         if isinstance(data, dict) and data.get("file") is not None:
+                             content = data["file"].get("content") if isinstance(data["file"].get("content"), dict) else {}
+                             data_path = content.get("path", None)
+                         bulks = get_items_to_parse_bulks(data, data_path, items_to_parse, items_to_parse_name, data.get("__base_jq", ".file.content"))
+                         async for bulk in bulks:
+                             yield bulk
                  else:
-                     yield validate_result(result)
+                     validated_result = validate_result(result)
+                     processed_result = _process_path_type_items(validated_result, items_to_parse)
+                     yield processed_result
      except OceanAbortException as error:
          errors.append(error)
          ocean.metrics.inc_metric(
@@ -101,6 +153,106 @@ def is_resource_supported(
  ) -> bool:
      return bool(resync_event_mapping[kind] or resync_event_mapping[None])
 
+
+ def _validate_jq_expression(expression: str) -> None:
+     """Validate a jq expression to prevent command injection."""
+     try:
+         _ = cast(JQEntityProcessor, ocean.app.integration.entity_processor)._compile(expression)
+     except Exception as e:
+         raise ValueError(f"Invalid jq expression: {e}") from e
+     # Basic validation - reject expressions that could be dangerous
+     # Check for dangerous patterns (include, import, module)
+     dangerous_patterns = ['include', 'import', 'module', 'env', 'debug']
+     for pattern in dangerous_patterns:
+         # Use a word-boundary regex to match only complete words, not substrings
+         if re.search(rf'\b{re.escape(pattern)}\b', expression):
+             raise ValueError(f"Potentially dangerous pattern '{pattern}' found in jq expression")
+
+     # Special handling for 'env' - block environment variable access
+     if re.search(r'(?<!\w)\$ENV(?:\.)?', expression):
+         raise ValueError("Environment variable access '$ENV.' found in jq expression")
+     if re.search(r'\benv\.', expression):
+         raise ValueError("Environment variable access 'env.' found in jq expression")
+
+
+ def _create_secure_temp_file(suffix: str = ".json") -> str:
+     """Create a secure temporary file with restricted permissions."""
+     # Create the temp directory if it doesn't exist
+     temp_dir = "/tmp/ocean"
+     os.makedirs(temp_dir, exist_ok=True)
+
+     # Create a temporary file with secure permissions
+     fd, temp_path = tempfile.mkstemp(suffix=suffix, dir=temp_dir)
+     try:
+         # Set restrictive permissions (owner read/write only)
+         os.chmod(temp_path, stat.S_IRUSR | stat.S_IWUSR)
+         return temp_path
+     finally:
+         os.close(fd)
+
+
+ async def get_items_to_parse_bulks(raw_data: dict[Any, Any], data_path: str | None, items_to_parse: str, items_to_parse_name: str, base_jq: str) -> AsyncGenerator[list[dict[str, Any]], None]:
+     # Validate inputs to prevent command injection
+     _validate_jq_expression(items_to_parse)
+     items_to_parse = items_to_parse.replace(base_jq, ".") if data_path else items_to_parse
+
+     temp_data_path = None
+     temp_output_path = None
+
+     try:
+         # Create secure temporary files
+         if not data_path:
+             raw_data_serialized = json.dumps(raw_data)
+             temp_data_path = _create_secure_temp_file("_input.json")
+             with open(temp_data_path, "w") as f:
+                 f.write(raw_data_serialized)
+             data_path = temp_data_path
+
+         temp_output_path = _create_secure_temp_file("_parsed.json")
+
+         delete_target = items_to_parse.split('|', 1)[0].strip() if not items_to_parse.startswith('map(') else '.'
+         base_jq_object_string = await _build_base_jq_object_string(raw_data, base_jq, delete_target)
+
+         # Build the jq expression safely
+         jq_expression = f""". as $all
+         | ($all | {items_to_parse}) as $items
+         | $items
+         | map({{{items_to_parse_name}: ., {base_jq_object_string}}})"""
+
+         # Use subprocess with list arguments instead of shell=True
+         jq_path = shutil.which("jq") or "/bin/jq"
+         jq_args = [jq_path, jq_expression, data_path]
+
+         with open(temp_output_path, "w") as output_file:
+             result = subprocess.run(
+                 jq_args,
+                 stdout=output_file,
+                 stderr=subprocess.PIPE,
+                 text=True,
+                 check=False  # Don't raise an exception; handle errors manually
+             )
+
+         if result.returncode != 0:
+             logger.error(f"Failed to parse items for JQ expression {items_to_parse}, error: {result.stderr}")
+             yield []
+         else:
+             with open(temp_output_path, "r") as f:
+                 events_stream = get_events_as_a_stream(f, 'item', ocean.config.yield_items_to_parse_batch_size)
+                 for items_bulk in events_stream:
+                     yield items_bulk
+
+     except ValueError as e:
+         logger.error(f"Invalid jq expression: {e}")
+         yield []
+     except Exception as e:
+         logger.error(f"Failed to parse items for JQ expression {items_to_parse}, error: {e}")
+         yield []
+     finally:
+         # Clean up temporary files
+         for temp_path in [temp_data_path, temp_output_path]:
+             if temp_path and os.path.exists(temp_path):
+                 try:
+                     os.remove(temp_path)
+                 except OSError as e:
+                     logger.warning(f"Failed to cleanup temporary file {temp_path}: {e}")
 
 
  def unsupported_kind_response(
      kind: str, available_resync_kinds: list[str]
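
To make the assembly above concrete, consider a hypothetical in-memory payload (no data_path, so raw_data is first dumped to a temp file): raw_data = {"file": {"content": {"items": [1, 2]}}, "repo": "r1"}, items_to_parse = ".file.content.items", items_to_parse_name = "item", and the default base_jq = ".file.content". The program handed to the jq binary would then be roughly:

. as $all
| ($all | .file.content.items) as $items
| $items
| map({item: ., "file": {"content": (($all | del(.file.content.items)) // {})}, "repo": "r1"})

Each parsed element is emitted as {"item": <element>, ...} alongside the original payload with the parsed branch deleted, which is what the mapping later addresses via the items_to_parse_name key (`.item` by default).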
@@ -108,6 +260,44 @@ def unsupported_kind_response(
      logger.error(f"Kind {kind} is not supported in this integration")
      return [], [KindNotImplementedException(kind, available_resync_kinds)]
 
+
+ async def _build_base_jq_object_string(raw_data: dict[Any, Any], base_jq: str, delete_target: str) -> str:
+     base_jq_object_before_parsing = await cast(JQEntityProcessor, ocean.app.integration.entity_processor)._search(raw_data, f'{base_jq} = {json.dumps("__all")}')
+     base_jq_object_before_parsing_serialized = json.dumps(base_jq_object_before_parsing)
+     base_jq_object_before_parsing_serialized = base_jq_object_before_parsing_serialized[1:-1] if len(base_jq_object_before_parsing_serialized) >= 2 else base_jq_object_before_parsing_serialized
+     base_jq_object_before_parsing_serialized = base_jq_object_before_parsing_serialized.replace("\"__all\"", f"(($all | del({delete_target})) // {{}})")
+     return base_jq_object_before_parsing_serialized
+
+
+ def get_events_as_a_stream(
+     stream: Any,
+     target_items: str = "item",
+     max_buffer_size_mb: int = 1
+ ) -> Generator[list[dict[str, Any]], None, None]:
+     events = ijson.sendable_list()
+     coro = ijson.items_coro(events, target_items, use_float=True)
+
+     # Convert MB to bytes for the buffer size
+     buffer_size = max_buffer_size_mb * 1024 * 1024
+
+     # Read chunks from the stream until it is exhausted
+     while True:
+         chunk = stream.read(buffer_size)
+         if not chunk:  # End of stream
+             break
+
+         # Convert str to bytes if necessary (for text-mode files)
+         if isinstance(chunk, str):
+             chunk = chunk.encode('utf-8')
+
+         coro.send(chunk)
+         yield events
+         del events[:]
+     try:
+         coro.close()
+     finally:
+         if events:
+             yield events
+             events[:] = []
 
 
  class ProcessWrapper(multiprocessing.Process):
      def __init__(self, *args, **kwargs):
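
A quick sanity check of the streaming helper (illustrative; any file-like object yielding bytes or text works):

import io

payload = io.BytesIO(b'[{"id": 1}, {"id": 2}, {"id": 3}]')
for bulk in get_events_as_a_stream(payload, target_items="item", max_buffer_size_mb=1):
    # One list of parsed objects is yielded per chunk read; here the whole
    # array fits in a single 1 MB chunk, so one bulk of three dicts arrives.
    print(bulk)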
@@ -134,3 +324,34 @@ def clear_http_client_context() -> None:
          _port_http_client.pop()
      except (RuntimeError, AttributeError):
          pass
+
+
+ class _AiterReader:
+     """
+     Wraps an iterable of byte chunks (e.g., response.iter_bytes())
+     and exposes a .read(n) method that ijson expects.
+     """
+     def __init__(self, iterable):
+         self._iter = iter(iterable)
+         self._buf = bytearray()
+         self._eof = False
+
+     def read(self, n=-1):
+         # If n < 0, return everything until EOF
+         if n is None or n < 0:
+             chunks = [bytes(self._buf)]
+             self._buf.clear()
+             chunks.extend(self._iter)  # drain the iterator
+             return b"".join(chunks)
+
+         # Fill the buffer until we have n bytes or hit EOF
+         while len(self._buf) < n and not self._eof:
+             try:
+                 self._buf.extend(next(self._iter))
+             except StopIteration:
+                 self._eof = True
+                 break
+
+         # Serve up to n bytes
+         out = bytes(self._buf[:n])
+         del self._buf[:n]
+         return out
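
_AiterReader composes naturally with get_events_as_a_stream for responses that arrive in chunks (a hedged sketch; the chunk list stands in for something like httpx's response.iter_bytes()):

chunks = [b'[{"id": 1}, ', b'{"id": 2}]']  # stand-in for response.iter_bytes()
reader = _AiterReader(chunks)
for bulk in get_events_as_a_stream(reader, target_items="item"):
    print(bulk)  # batches of parsed objects, e.g. [{'id': 1}, {'id': 2}]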