garf-executors 1.0.2__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,10 +11,16 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
+ """Workflow specifies steps of end-to-end fetching and processing."""
15
+
14
16
  from __future__ import annotations
15
17
 
18
+ import copy
16
19
  import os
17
20
  import pathlib
21
+ import re
22
+ from collections import defaultdict
23
+ from typing import Any
18
24
 
19
25
  import pydantic
20
26
  import smart_open
@@ -37,6 +43,13 @@ class QueryPath(pydantic.BaseModel):
37
43
  """Path file with query."""
38
44
 
39
45
  path: str
46
+ prefix: str | None = None
47
+
48
+ @property
49
+ def full_path(self) -> str:
50
+ if self.prefix:
51
+ return re.sub('/$', '', self.prefix) + '/' + self.path
52
+ return self.path
40
53
 
41
54
 
42
55
  class QueryDefinition(pydantic.BaseModel):
@@ -65,11 +78,13 @@ class ExecutionStep(ExecutionContext):
65
78
  alias: Optional alias to identify execution step.
66
79
  queries: Queries to run for a particular fetcher.
67
80
  context: Execution context for queries and fetcher.
81
+ parallel_threshold: Max allowed parallelism for the queries in the step.
68
82
  """
69
83
 
70
84
  fetcher: str | None = None
71
85
  alias: str | None = pydantic.Field(default=None, pattern=r'^[a-zA-Z0-9_]+$')
72
86
  queries: list[QueryPath | QueryDefinition | QueryFolder] | None = None
87
+ parallel_threshold: int | None = None
73
88
 
74
89
  @property
75
90
  def context(self) -> ExecutionContext:
@@ -86,17 +101,43 @@ class Workflow(pydantic.BaseModel):
86
101
 
87
102
  Attributes:
88
103
  steps: Contains one or several fetcher executions.
104
+ context: Query and fetcher parameters to overwrite in steps.
89
105
  """
90
106
 
91
107
  steps: list[ExecutionStep]
108
+ context: ExecutionContext | None = None
109
+
110
+ def model_post_init(self, __context__) -> None:
111
+ if context := self.context:
112
+ custom_parameters = defaultdict(dict)
113
+ if custom_macros := context.query_parameters.macro:
114
+ custom_parameters['query_parameters']['macro'] = custom_macros
115
+ if custom_templates := context.query_parameters.template:
116
+ custom_parameters['query_parameters']['template'] = custom_templates
117
+ if custom_fetcher_parameters := context.fetcher_parameters:
118
+ custom_parameters['fetcher_parameters'] = custom_fetcher_parameters
119
+ if custom_writer_parameters := context.writer_parameters:
120
+ custom_parameters['writer_parameters'] = custom_writer_parameters
121
+
122
+ if custom_parameters:
123
+ steps = self.steps
124
+ for i, step in enumerate(steps):
125
+ res = _merge_dicts(
126
+ step.model_dump(exclude_none=True), dict(custom_parameters)
127
+ )
128
+ steps[i] = ExecutionStep(**res)
92
129
 
93
130
  @classmethod
94
- def from_file(cls, path: str | pathlib.Path | os.PathLike[str]) -> Workflow:
131
+ def from_file(
132
+ cls,
133
+ path: str | pathlib.Path | os.PathLike[str],
134
+ context: ExecutionContext | None = None,
135
+ ) -> Workflow:
95
136
  """Builds workflow from local or remote yaml file."""
96
137
  with smart_open.open(path, 'r', encoding='utf-8') as f:
97
138
  data = yaml.safe_load(f)
98
139
  try:
99
- return Workflow(**data)
140
+ return Workflow(steps=data.get('steps'), context=context)
100
141
  except pydantic.ValidationError as e:
101
142
  raise GarfWorkflowError(f'Incorrect workflow:\n {e}') from e
102
143
 
@@ -104,6 +145,22 @@ class Workflow(pydantic.BaseModel):
104
145
  """Saves workflow to local or remote yaml file."""
105
146
  with smart_open.open(path, 'w', encoding='utf-8') as f:
106
147
  yaml.dump(
107
- self.model_dump(exclude_none=True).get('steps'), f, encoding='utf-8'
148
+ self.model_dump(exclude_none=True), f, encoding='utf-8', sort_keys=False
108
149
  )
109
150
  return f'Workflow is saved to {str(path)}'
151
+
152
+
153
+ def _merge_dicts(
154
+ dict1: dict[str, Any], dict2: dict[str, Any]
155
+ ) -> dict[str, Any]:
156
+ result = copy.deepcopy(dict1)
157
+ for key, value in dict2.items():
158
+ if (
159
+ key in result
160
+ and isinstance(result[key], dict)
161
+ and isinstance(value, dict)
162
+ ):
163
+ result[key] = _merge_dicts(result[key], value)
164
+ else:
165
+ result[key] = value
166
+ return result
@@ -0,0 +1,176 @@
1
+ # Copyright 2026 Google LLC
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Runs garf workflow."""
15
+
16
+ from __future__ import annotations
17
+
18
+ import logging
19
+ import pathlib
20
+ import re
21
+ from typing import Final
22
+
23
+ import yaml
24
+ from garf.executors import exceptions, setup
25
+ from garf.executors.telemetry import tracer
26
+ from garf.executors.workflows import workflow
27
+ from garf.io import reader
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+ _REMOTE_FILES_PATTERN: Final[str] = (
32
+ '^(http|gs|s3|aruze|hdfs|webhdfs|ssh|scp|sftp)'
33
+ )
34
+ _SCRIPT_PATH = pathlib.Path(__file__).parent
35
+
36
+
37
class WorkflowRunner:
  """Runs garf workflow.

  Attributes:
    workflow: Workflow to execute.
    wf_parent: Location of the folder with the workflow file; query paths
      without a prefix are resolved relative to it.
    parallel_threshold: Max allowed parallelism for the queries in the workflow.
  """

  def __init__(
    self,
    execution_workflow: workflow.Workflow,
    wf_parent: pathlib.Path | str,
    parallel_threshold: int = 10,
  ) -> None:
    """Initializes WorkflowRunner."""
    self.workflow = execution_workflow
    self.wf_parent = wf_parent
    self.parallel_threshold = parallel_threshold

  @classmethod
  def from_file(
    cls,
    workflow_file: str | pathlib.Path,
  ) -> WorkflowRunner:
    """Initializes workflow runner from a local or remote file.

    Args:
      workflow_file: Local path or remote URL of the workflow yaml file.

    Returns:
      Runner with the loaded workflow and the file's parent location.
    """
    if isinstance(workflow_file, str) and re.match(
      _REMOTE_FILES_PATTERN, workflow_file
    ):
      # pathlib collapses the '//' of a URL ('gs://b' -> 'gs:/b'), so remote
      # locations are kept as plain strings.
      execution_workflow = workflow.Workflow.from_file(workflow_file)
      return cls(
        execution_workflow=execution_workflow,
        wf_parent=workflow_file.rsplit('/', 1)[0],
      )
    workflow_file = pathlib.Path(workflow_file)
    execution_workflow = workflow.Workflow.from_file(workflow_file)
    return cls(
      execution_workflow=execution_workflow, wf_parent=workflow_file.parent
    )

  def run(
    self,
    enable_cache: bool = False,
    cache_ttl_seconds: int = 3600,
    selected_aliases: list[str] | None = None,
    skipped_aliases: list[str] | None = None,
    simulate: bool = False,
  ) -> list[str]:
    """Executes workflow steps one by one.

    Args:
      enable_cache: Passed to the executor setup of every step.
      cache_ttl_seconds: Passed to the executor setup of every step.
      selected_aliases: When provided, only steps with these aliases are run.
      skipped_aliases: Steps with these aliases are not run.
      simulate: Passed to the executor setup of every step.

    Returns:
      Names ('<index>-<fetcher>[-<alias>]') of the executed steps.

    Raises:
      GarfExecutorError: When a step has no queries.
      GarfWorkflowError: When a local query file or folder is not found.
    """
    skipped_aliases = skipped_aliases or []
    selected_aliases = selected_aliases or []
    reader_client = reader.create_reader('file')
    execution_results = []
    logger.info('Starting Garf Workflow...')
    for i, step in enumerate(self.workflow.steps, 1):
      step_name = f'{i}-{step.fetcher}'
      if step.alias:
        step_name = f'{step_name}-{step.alias}'
      is_skipped = step.alias in skipped_aliases
      is_not_selected = (
        bool(selected_aliases) and step.alias not in selected_aliases
      )
      if is_skipped or is_not_selected:
        logger.warning(
          'Skipping step %d, fetcher: %s, alias: %s',
          i,
          step.fetcher,
          step.alias,
        )
        continue
      with tracer.start_as_current_span(step_name):
        logger.info(
          'Running step %d, fetcher: %s, alias: %s', i, step.fetcher, step.alias
        )
        query_executor = setup.setup_executor(
          source=step.fetcher,
          fetcher_parameters=step.fetcher_parameters,
          enable_cache=enable_cache,
          cache_ttl_seconds=cache_ttl_seconds,
          simulate=simulate,
          writers=step.writer,
          writer_parameters=step.writer_parameters,
        )
        if not (queries := step.queries):
          logger.error('Please provide one or more queries to run')
          raise exceptions.GarfExecutorError(
            'Please provide one or more queries to run'
          )
        batch = self._build_batch(queries, reader_client)
        query_executor.execute_batch(
          batch,
          step.context,
          step.parallel_threshold or self.parallel_threshold,
        )
        execution_results.append(step_name)
    return execution_results

  def _build_batch(self, queries, reader_client) -> dict:
    """Maps query titles to query texts for all queries of a step."""
    batch = {}
    for query in queries:
      if isinstance(query, workflow.QueryPath):
        batch[query.path] = reader_client.read(self._resolve_query_path(query))
      elif isinstance(query, workflow.QueryFolder):
        folder = self.wf_parent / pathlib.Path(query.folder)
        if not folder.exists():
          raise workflow.GarfWorkflowError(f'Folder: {folder} not found')
        for p in folder.rglob('*'):
          if p.suffix == '.sql':
            batch[p.stem] = reader_client.read(p)
      else:
        # Inline query definition: the text is already part of the workflow.
        batch[query.query.title] = query.query.text
    return batch

  def _resolve_query_path(self, query) -> str | pathlib.Path:
    """Finds the location to read a QueryPath from, validating local files."""
    full_path = query.full_path
    if re.match(_REMOTE_FILES_PATTERN, full_path):
      return full_path
    if query.prefix:
      # full_path is a plain string; wrap it so that the existence check
      # below works instead of failing on a missing 'exists' attribute.
      local_path = pathlib.Path(full_path)
    elif isinstance(self.wf_parent, str) and re.match(
      _REMOTE_FILES_PATTERN, self.wf_parent
    ):
      # Workflow itself is remote: resolve the query next to it without
      # going through pathlib, which would mangle the URL.
      parent = self.wf_parent.rstrip('/')
      return f'{parent}/{query.path}'
    else:
      local_path = self.wf_parent / pathlib.Path(query.path)
    if not local_path.exists():
      raise workflow.GarfWorkflowError(f'Query: {local_path} not found')
    return local_path

  def compile(self, path: str | pathlib.Path) -> str:
    """Saves workflow with expanded anchors."""
    return self.workflow.save(path)

  def deploy(self, path: str | pathlib.Path) -> str:
    """Prepares workflow for deployment to Google Cloud Workflows.

    Wraps the packaged 'gcp_workflow.yaml' template and the steps of the
    current workflow into a single definition and writes it to `path`.

    Args:
      path: Local path to write the resulting yaml to.

    Returns:
      Message with the location of the saved workflow.
    """
    wf = self.workflow.model_dump(exclude_none=True).get('steps')
    with open(_SCRIPT_PATH / 'gcp_workflow.yaml', 'r', encoding='utf-8') as f:
      cloud_workflow_run_template = yaml.safe_load(f)
    init = {
      'init': {
        'assign': [{'pairs': wf}],
      },
    }
    cloud_workflow = {
      'main': {
        'params': [],
        'steps': [init, cloud_workflow_run_template],
      },
    }
    with open(path, 'w', encoding='utf-8') as f:
      yaml.dump(cloud_workflow, f, sort_keys=False)
    return f'Workflow is saved to {path}'
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: garf-executors
3
- Version: 1.0.2
3
+ Version: 1.2.0
4
4
  Summary: Executes queries against API and writes data to local/remote storage.
5
5
  Author-email: "Google Inc. (gTech gPS CSE team)" <no-reply@google.com>, Andrei Markin <andrey.markin.ppc@gmail.com>
6
6
  License: Apache 2.0
@@ -36,8 +36,15 @@ Provides-Extra: gcp
36
36
  Requires-Dist: opentelemetry-exporter-gcp-trace; extra == "gcp"
37
37
  Provides-Extra: server
38
38
  Requires-Dist: fastapi[standard]; extra == "server"
39
+ Requires-Dist: pydantic-settings; extra == "server"
39
40
  Requires-Dist: opentelemetry-instrumentation-fastapi; extra == "server"
40
41
  Requires-Dist: typer; extra == "server"
42
+ Requires-Dist: grpcio-reflection; extra == "server"
43
+ Provides-Extra: tests
44
+ Requires-Dist: pytest; extra == "tests"
45
+ Requires-Dist: pytest-mock; extra == "tests"
46
+ Requires-Dist: pytest-xdist; extra == "tests"
47
+ Requires-Dist: pytest-grpc; extra == "tests"
41
48
  Provides-Extra: all
42
49
  Requires-Dist: garf-executors[bq,gcp,server,sql]; extra == "all"
43
50
 
@@ -1,23 +1,27 @@
1
- garf/executors/__init__.py,sha256=dO6T9Z9Q5zc3BszqaTQE8fK5weyV_RTmj8ElV0mUFhQ,1941
2
- garf/executors/api_executor.py,sha256=bLhG9FLgAfdO16BW22YhAq0SXY8c-WuNW13e6bUu_VE,4380
3
- garf/executors/bq_executor.py,sha256=f5jQllFAE3b_ajQlosZfecovnrIDfyOAO-k8_AjzRqQ,5774
1
+ garf/executors/__init__.py,sha256=i7udlIWHIzMtLJmErjJJGiuauAGlApUlyB2sdX8gR4I,824
2
+ garf/executors/api_executor.py,sha256=96slSspngBQ2_R15gLxaUd565EqCg7syyC2yf2oK6ic,7118
3
+ garf/executors/bq_executor.py,sha256=-LlDFNF-OvPy-9_QzY1d58emOs5eehsgIDPYUkm-H1A,6696
4
4
  garf/executors/config.py,sha256=w5g9EYabPtK-6CPl3G87owUvBiqi0_r_aeAwISpK-zw,1720
5
5
  garf/executors/exceptions.py,sha256=U_7Q2ZMOUf89gzZd2pw7y3g7i1NeByPPKfpZ3q7p3ZU,662
6
- garf/executors/execution_context.py,sha256=us_S-x2jsJOBo4Vm78DhCgcgnu_HPc-f9-q_XI_eowU,3840
6
+ garf/executors/execution_context.py,sha256=_T9sEkkImTjtdwJcnxbCEZXA6cpXh5GVs9CCGcGdGcw,3380
7
7
  garf/executors/executor.py,sha256=SpPONsYHO49WbZ2HRjTKujrNH64BbrSfKcZSn4Dcd6o,3830
8
- garf/executors/fetchers.py,sha256=Jtdl9A2G4xEM-pnUg_fWv3cltcMpM7JjdTcLeJK5AVY,2604
9
- garf/executors/garf_pb2.py,sha256=mYvBYcAnZtyDflXGN2GZLM2KM0Nv9hoJs55zfQU_l1o,2564
10
- garf/executors/garf_pb2_grpc.py,sha256=w8D_r3wpj1ZZstkIFogY679-lSCcL2iZQ4QLO8IfToY,3359
11
- garf/executors/query_processor.py,sha256=xCTkm2V0iMfsjJ0ydwyVRFFiZh1ilhJKzglYlReRKAk,2250
12
- garf/executors/sql_executor.py,sha256=Z1E7aL9HiYcqfUJMuESlu4iZjVy5hNMvplu2hLotK9I,4780
8
+ garf/executors/fetchers.py,sha256=JemMM4FU4-Cpp2SxmMBtLgHgGy2gDsQSuuAozTh7Yjw,4477
9
+ garf/executors/garf_pb2.py,sha256=OIKC7NGErbUckhB4pQ6HycB9My5X3FvaImARnvhPExM,3450
10
+ garf/executors/garf_pb2_grpc.py,sha256=repGTh-ZDnNyAxMcJxAf0cLfr_JjX2AzZkY6PfZy0xM,4957
11
+ garf/executors/query_processor.py,sha256=IM5qXBNYJSUlsY2djYx8WDeX367cMhI1ZrITT22TcvI,2932
12
+ garf/executors/setup.py,sha256=txQtkpMNQ5WlYF2MzJdvPop07F1IqNN_X97mzedghDc,2604
13
+ garf/executors/sql_executor.py,sha256=TzDzVxpNjyniu9ZHubcLUfYg0igGc1cOUXH9ETt1WL8,5226
13
14
  garf/executors/telemetry.py,sha256=wLWAdJZmGinffIMv5FZNKaAUusgACTvokwhMFz2UCQ0,747
14
- garf/executors/workflow.py,sha256=WFsTV916mw_DsWttrLc8tNnDl0vwJAQwVlIHRdNaNDw,3007
15
15
  garf/executors/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
- garf/executors/entrypoints/cli.py,sha256=8vk8jElesqqn0q2Te3v6W1V-kts-28N5sncovaZqq3o,6699
17
- garf/executors/entrypoints/grpc_server.py,sha256=__y2uR2pwlnYy1YFm-2EjFYxBKZmN1a9XY9MNsfrhag,2295
18
- garf/executors/entrypoints/server.py,sha256=vXGz-f7C1aweaMNJTJEjZLIg42d0T01HPG_lxlj-FPs,3472
19
- garf/executors/entrypoints/tracer.py,sha256=-UM1RNtW90jFsQiavVzhIWzB3cOCTcCwc6v7YIVflDk,1912
16
+ garf/executors/entrypoints/cli.py,sha256=x7RAQngeb3rr3rIhJ3d01q38G6DhTQYgt6p3nSPegxg,6123
17
+ garf/executors/entrypoints/grpc_server.py,sha256=O7hinPjlVurSJjzudDFvVjkxX7xJHRcwFk4K4aJK0uo,2783
18
+ garf/executors/entrypoints/server.py,sha256=rLnOgZqeWwZ9UsJZ8LXDvZ101JrUIwMTpDzC5t-6wsY,4988
19
+ garf/executors/entrypoints/tracer.py,sha256=Oug3tePD8Yg5x6r1KNDx8RL_yAML0egy1DEFDobW8Uk,2792
20
20
  garf/executors/entrypoints/utils.py,sha256=5XiGR2IOxdzAOY0lEWUeUV7tIpKBGRnQaIwBYvzQB7c,4337
21
+ garf/executors/workflows/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
+ garf/executors/workflows/gcp_workflow.yaml,sha256=KKfyFaBihFiLrsNhZct0cccxMV8CnVJUsxBsXQ8VP-g,1743
23
+ garf/executors/workflows/workflow.py,sha256=xqPFHq6xJSWhpbw47U1dUx9AQpYJyMGqA6pRG-4CFQo,4952
24
+ garf/executors/workflows/workflow_runner.py,sha256=A3NxfBq9TgXs9CCaK8MzJBHVLTCDtF1sSuy_BLWbQco,5929
21
25
  garf_executors/__init__.py,sha256=5Ol67ktUcC0q5d5pGblYfdlAsraJC-Gcr2U0uCN6rSs,772
22
26
  garf_executors/api_executor.py,sha256=lmrPn6aheryM2jLRL2enU8GuKVUod3kEVkMYgjXn5EM,785
23
27
  garf_executors/bq_executor.py,sha256=wQ8pd4d6dMByHtYl_i-FbebPKO5WRcvC_y9asG8H3Zk,784
@@ -35,8 +39,8 @@ garf_executors/entrypoints/grcp_server.py,sha256=HiYsfk31OgkPA4jcED3htJGidkRZH5N
35
39
  garf_executors/entrypoints/server.py,sha256=JZCklhqx74PIhc4GIoOr2nNZz9n7QCfIm_vGyd3Q3dQ,815
36
40
  garf_executors/entrypoints/tracer.py,sha256=JlDSgeDP0Q5Lk_pZLASDXPgZCPaKkWqZgaWOQ7JB-Bs,815
37
41
  garf_executors/entrypoints/utils.py,sha256=iF-LBfKjrAhEW6HShh69RCPWVPC4Rf8iV5JEYSMhsx0,814
38
- garf_executors-1.0.2.dist-info/METADATA,sha256=zNOfNeIWJiEoaWz2ONVf0gQeVRc5q6DM8CxnUxmoB90,3303
39
- garf_executors-1.0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
40
- garf_executors-1.0.2.dist-info/entry_points.txt,sha256=0IBZun3_hC4HYU-1krlbjTArZym3phu4jxYXs809ilw,61
41
- garf_executors-1.0.2.dist-info/top_level.txt,sha256=UaHdWdgQhbiHyRzpYC-vW3Q7pdgbxXvTTBvDA655Jq4,20
42
- garf_executors-1.0.2.dist-info/RECORD,,
42
+ garf_executors-1.2.0.dist-info/METADATA,sha256=s8o4jKBZBT6uAxL6NMoU5pgYGv2JgEHx2f7LVG_jDj0,3605
43
+ garf_executors-1.2.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
44
+ garf_executors-1.2.0.dist-info/entry_points.txt,sha256=0IBZun3_hC4HYU-1krlbjTArZym3phu4jxYXs809ilw,61
45
+ garf_executors-1.2.0.dist-info/top_level.txt,sha256=UaHdWdgQhbiHyRzpYC-vW3Q7pdgbxXvTTBvDA655Jq4,20
46
+ garf_executors-1.2.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5