Flowfile 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of Flowfile might be problematic. Click here for more details.
- flowfile/__init__.py +27 -6
- flowfile/api.py +1 -0
- flowfile/web/__init__.py +2 -2
- flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
- flowfile/web/static/assets/CloudConnectionManager-c20a740f.js +783 -0
- flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
- flowfile/web/static/assets/CloudStorageReader-960b400a.js +437 -0
- flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
- flowfile/web/static/assets/CloudStorageWriter-e3decbdd.js +430 -0
- flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-d67e2405.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-a81e0f7e.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-9ea35e84.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-9578bfa5.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-19531098.js} +9 -9
- flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-40476474.js} +47141 -43697
- flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-2297ef96.js} +6 -6
- flowfile/web/static/assets/{Filter-f87bb897.js → Filter-f211c03a.js} +8 -8
- flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
- flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-4207ea31.js} +75 -9
- flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-bf120df0.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-5bb7497a.js} +5 -5
- flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-92c81b65.js} +6 -6
- flowfile/web/static/assets/{Join-eec38203.js → Join-4e49a274.js} +23 -15
- flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
- flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-90998ae8.js} +106 -34
- flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
- flowfile/web/static/assets/{Output-3b2ca045.js → Output-81e3e917.js} +4 -4
- flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-a3419842.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-72710deb.js} +6 -6
- flowfile/web/static/assets/{Read-07acdc9a.js → Read-c4059daf.js} +6 -6
- flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-c2b5e095.js} +5 -5
- flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-10baf191.js} +6 -6
- flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-3ed9a0ae.js} +5 -5
- flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-0d49c0e8.js} +2 -2
- flowfile/web/static/assets/{Select-32b28406.js → Select-8a02a0b3.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-4c0f45f5.js} +1 -1
- flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-f55c9f9d.js} +6 -6
- flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-5dbc2145.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-a1768e52.js} +2 -2
- flowfile/web/static/assets/{Union-39eecc6c.js → Union-f2aefdc9.js} +5 -5
- flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-46b250da.js} +8 -8
- flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-25ac84cc.js} +5 -5
- flowfile/web/static/assets/api-6ef0dcef.js +80 -0
- flowfile/web/static/assets/{api-44ca9e9c.js → api-a0abbdc7.js} +1 -1
- flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
- flowfile/web/static/assets/{designer-267d44f1.js → designer-13eabd83.js} +36 -34
- flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-b87e7f6f.js} +1 -1
- flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-13564764.js} +1 -1
- flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-fd2cd6f9.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-71e11604.js} +3 -3
- flowfile/web/static/assets/{index-e235a8bc.js → index-f6c15e76.js} +59 -22
- flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-988d9efe.js} +3 -3
- flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-dd636aa2.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-af36165e.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-2847001e.js} +2 -1
- flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-0371da73.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/METADATA +9 -4
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/RECORD +131 -124
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/entry_points.txt +2 -0
- flowfile_core/__init__.py +3 -0
- flowfile_core/auth/jwt.py +39 -0
- flowfile_core/configs/node_store/nodes.py +9 -6
- flowfile_core/configs/settings.py +6 -5
- flowfile_core/database/connection.py +63 -15
- flowfile_core/database/init_db.py +0 -1
- flowfile_core/database/models.py +49 -2
- flowfile_core/flowfile/code_generator/code_generator.py +472 -17
- flowfile_core/flowfile/connection_manager/models.py +1 -1
- flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
- flowfile_core/flowfile/extensions.py +1 -1
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1062 -311
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
- flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +3 -1
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
- flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
- flowfile_core/flowfile/flow_graph.py +718 -253
- flowfile_core/flowfile/flow_graph_utils.py +2 -2
- flowfile_core/flowfile/flow_node/flow_node.py +563 -117
- flowfile_core/flowfile/flow_node/models.py +154 -20
- flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
- flowfile_core/flowfile/handler.py +2 -33
- flowfile_core/flowfile/manage/open_flowfile.py +1 -2
- flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
- flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
- flowfile_core/flowfile/util/calculate_layout.py +0 -2
- flowfile_core/flowfile/utils.py +35 -26
- flowfile_core/main.py +35 -15
- flowfile_core/routes/cloud_connections.py +77 -0
- flowfile_core/routes/logs.py +2 -7
- flowfile_core/routes/public.py +1 -0
- flowfile_core/routes/routes.py +130 -90
- flowfile_core/routes/secrets.py +72 -14
- flowfile_core/schemas/__init__.py +8 -0
- flowfile_core/schemas/cloud_storage_schemas.py +215 -0
- flowfile_core/schemas/input_schema.py +121 -71
- flowfile_core/schemas/output_model.py +19 -3
- flowfile_core/schemas/schemas.py +150 -12
- flowfile_core/schemas/transform_schema.py +175 -35
- flowfile_core/utils/utils.py +40 -1
- flowfile_core/utils/validate_setup.py +41 -0
- flowfile_frame/__init__.py +9 -1
- flowfile_frame/cloud_storage/frame_helpers.py +39 -0
- flowfile_frame/cloud_storage/secret_manager.py +73 -0
- flowfile_frame/expr.py +28 -1
- flowfile_frame/expr.pyi +76 -61
- flowfile_frame/flow_frame.py +481 -208
- flowfile_frame/flow_frame.pyi +140 -91
- flowfile_frame/flow_frame_methods.py +160 -22
- flowfile_frame/group_frame.py +3 -0
- flowfile_frame/utils.py +25 -3
- flowfile_worker/external_sources/s3_source/main.py +216 -0
- flowfile_worker/external_sources/s3_source/models.py +142 -0
- flowfile_worker/funcs.py +51 -6
- flowfile_worker/models.py +22 -2
- flowfile_worker/routes.py +40 -38
- flowfile_worker/utils.py +1 -1
- test_utils/s3/commands.py +46 -0
- test_utils/s3/data_generator.py +292 -0
- test_utils/s3/demo_data_generator.py +186 -0
- test_utils/s3/fixtures.py +214 -0
- flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
- flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
- flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
- flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
- flowfile_core/schemas/defaults.py +0 -9
- flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
- flowfile_core/schemas/models.py +0 -193
- flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
- flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
- flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
- flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/LICENSE +0 -0
- {flowfile-0.3.5.dist-info → flowfile-0.3.7.dist-info}/WHEEL +0 -0
- {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_frame/cloud_storage}/__init__.py +0 -0
- {flowfile_core/schemas/external_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
- {flowfile_worker/external_sources/airbyte_sources → test_utils/s3}/__init__.py +0 -0
flowfile_core/schemas/models.py
DELETED
|
@@ -1,193 +0,0 @@
|
|
|
1
|
-
from pydantic import BaseModel
|
|
2
|
-
from typing import List, Any, Callable
|
|
3
|
-
from datetime import datetime
|
|
4
|
-
from dataclasses import dataclass, field
|
|
5
|
-
import os
|
|
6
|
-
import mimetypes
|
|
7
|
-
from copy import deepcopy
|
|
8
|
-
from flowfile_core.configs.settings import FILE_LOCATION
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class DirItem:
|
|
12
|
-
name: str
|
|
13
|
-
full_path: str
|
|
14
|
-
path: str
|
|
15
|
-
type: str
|
|
16
|
-
stats: os.stat = None
|
|
17
|
-
creation_date: datetime = None
|
|
18
|
-
access_date: datetime = None
|
|
19
|
-
modification_date: datetime = None
|
|
20
|
-
source_path: str = None
|
|
21
|
-
|
|
22
|
-
def __init__(self, name: str, path: str,
|
|
23
|
-
stats: os.stat = None,
|
|
24
|
-
creation_date: datetime = None,
|
|
25
|
-
modification_date: datetime = None,
|
|
26
|
-
access_date: datetime = None, *args, **kwargs):
|
|
27
|
-
self.full_path = os.path.relpath(path)
|
|
28
|
-
if name == FILE_LOCATION:
|
|
29
|
-
self.name = self.full_path
|
|
30
|
-
self.source_path = self.full_path
|
|
31
|
-
else:
|
|
32
|
-
self.name = name
|
|
33
|
-
self.source_path = os.sep.join(os.path.split(self.full_path)[:-1])
|
|
34
|
-
self.path = self.full_path
|
|
35
|
-
self.stats = os.stat(self.full_path) if stats is None else stats
|
|
36
|
-
self.creation_date = datetime.fromtimestamp(self.stats.st_ctime) if creation_date is None else creation_date
|
|
37
|
-
self.modification_date = datetime.fromtimestamp(self.stats.st_mtime) if modification_date is None else modification_date
|
|
38
|
-
self.access_date = datetime.fromtimestamp(self.stats.st_atime) if access_date is None else access_date
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
@dataclass
|
|
42
|
-
class DirFile(DirItem):
|
|
43
|
-
ext: str
|
|
44
|
-
file_size: int
|
|
45
|
-
size: int
|
|
46
|
-
mimetype: str = None
|
|
47
|
-
|
|
48
|
-
def __init__(self, name: str, path: str):
|
|
49
|
-
ext = os.path.splitext(name)[-1]
|
|
50
|
-
if ext == '':
|
|
51
|
-
ext = 'unk'
|
|
52
|
-
self.ext = ext
|
|
53
|
-
self.type = 'file'
|
|
54
|
-
self.mimetype = mimetypes.guess_type(path)[0]
|
|
55
|
-
super().__init__(name, path)
|
|
56
|
-
self.file_size = self.stats.st_size
|
|
57
|
-
self.size = self.file_size_in_kb
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
@property
|
|
61
|
-
def file_size_in_kb(self) -> int:
|
|
62
|
-
return int(self.file_size/1024)
|
|
63
|
-
|
|
64
|
-
def json_repr(self):
|
|
65
|
-
self_dict = deepcopy(self.__dict__)
|
|
66
|
-
self_dict.pop('stats')
|
|
67
|
-
return self_dict
|
|
68
|
-
|
|
69
|
-
@dataclass
|
|
70
|
-
class DirLocation(DirItem):
|
|
71
|
-
all_items: List[str] = field(default_factory=list)
|
|
72
|
-
number_of_items: int = -1
|
|
73
|
-
files: List[DirFile] = field(default_factory=list)
|
|
74
|
-
directories: List["DirLocation"] = field(default_factory=list)
|
|
75
|
-
__size: int = -1
|
|
76
|
-
|
|
77
|
-
def __init__(self, name: str, path: str = None):
|
|
78
|
-
path = '.' + os.path.sep + name if path is None else path
|
|
79
|
-
self.all_items = os.listdir(path)
|
|
80
|
-
self.files = []
|
|
81
|
-
self.directories = []
|
|
82
|
-
self.type = 'dir'
|
|
83
|
-
super().__init__(name, path)
|
|
84
|
-
self.create_structure()
|
|
85
|
-
self.number_of_items = len(self.all_items)
|
|
86
|
-
|
|
87
|
-
def as_dict(self):
|
|
88
|
-
return {**self.__dict__,**{'size':self.size}}
|
|
89
|
-
|
|
90
|
-
def get_dirs(self):
|
|
91
|
-
return self.directories
|
|
92
|
-
|
|
93
|
-
@property
|
|
94
|
-
def size(self) -> int:
|
|
95
|
-
if self.__size<0:
|
|
96
|
-
size = 0
|
|
97
|
-
for file in self.files:
|
|
98
|
-
size += file.size
|
|
99
|
-
for d in self.directories:
|
|
100
|
-
size += d.size
|
|
101
|
-
self.__size = size
|
|
102
|
-
return self.__size
|
|
103
|
-
|
|
104
|
-
def create_structure(self):
|
|
105
|
-
for item_name in self.all_items:
|
|
106
|
-
|
|
107
|
-
ref = os.path.join(self.full_path, item_name)
|
|
108
|
-
if os.path.isdir(ref):
|
|
109
|
-
self.directories.append(self.__class__(item_name,ref))
|
|
110
|
-
elif os.path.isfile(ref):
|
|
111
|
-
self.files.append(DirFile(item_name,ref))
|
|
112
|
-
|
|
113
|
-
def json_repr(self):
|
|
114
|
-
self_dict = deepcopy(self.__dict__)
|
|
115
|
-
self_dict.pop('stats')
|
|
116
|
-
files: List[DirFile] = self_dict.pop('files')
|
|
117
|
-
directories: List[DirLocation] = self_dict.pop('directories')
|
|
118
|
-
self_dict.pop('all_items')
|
|
119
|
-
self_dict['files'] = [f.json_repr() for f in files]
|
|
120
|
-
self_dict['directories'] = [d.json_repr() for d in directories]
|
|
121
|
-
return self_dict
|
|
122
|
-
|
|
123
|
-
class Country(BaseModel):
|
|
124
|
-
id: int
|
|
125
|
-
name: str
|
|
126
|
-
abbreviation: str
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
class Address(BaseModel):
|
|
130
|
-
street: str = None
|
|
131
|
-
house_number: int = None
|
|
132
|
-
house_number_addition: str = None
|
|
133
|
-
city: str = None
|
|
134
|
-
zipcode: str = None
|
|
135
|
-
country: Country = None
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
class Team(Address):
|
|
139
|
-
id: int
|
|
140
|
-
name: str
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
class User(Address):
|
|
144
|
-
user_name: str
|
|
145
|
-
hashed_password: str
|
|
146
|
-
user_id: int
|
|
147
|
-
team: Team
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
class ExampleValues(BaseModel):
|
|
151
|
-
pass
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
class SourceStats(BaseModel):
|
|
155
|
-
id: int
|
|
156
|
-
created_by: User
|
|
157
|
-
created_date: datetime = datetime.now()
|
|
158
|
-
updated_by: User
|
|
159
|
-
updated_date: datetime = datetime.now()
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
class Source(SourceStats):
|
|
163
|
-
name: str
|
|
164
|
-
team: Team
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
class Table(SourceStats):
|
|
168
|
-
name: str
|
|
169
|
-
table_type: str = None
|
|
170
|
-
table_fields: List['TableField'] = None
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
class TableField(BaseModel):
|
|
174
|
-
id: int
|
|
175
|
-
source: Source
|
|
176
|
-
name: str
|
|
177
|
-
python_type: type
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
class FieldMapper(SourceStats):
|
|
181
|
-
mapping_type: str = None # direct, formula, default value
|
|
182
|
-
input_fields: List[TableField] = None
|
|
183
|
-
output_fields: List[TableField]
|
|
184
|
-
source_table: Table
|
|
185
|
-
target_table: Table
|
|
186
|
-
formula: Callable = None
|
|
187
|
-
default_value: Any = None
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
class TableMapper(SourceStats):
|
|
191
|
-
source_table: Table
|
|
192
|
-
target_table: Table
|
|
193
|
-
|
|
flowfile_worker/external_sources/airbyte_sources/cache_manager.py
DELETED
|
@@ -1,161 +0,0 @@
|
|
|
1
|
-
from airbyte.caches import DuckDBCache
|
|
2
|
-
|
|
3
|
-
import os
|
|
4
|
-
import atexit
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
import filelock
|
|
7
|
-
from contextlib import contextmanager
|
|
8
|
-
|
|
9
|
-
from flowfile_worker.configs import logger
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class DuckDBCacheManager:
|
|
13
|
-
"""
|
|
14
|
-
Manages DuckDB cache instances with multiprocessing-safe lock handling.
|
|
15
|
-
Coordinates cache access across different processes using file-based locks.
|
|
16
|
-
"""
|
|
17
|
-
|
|
18
|
-
def __init__(self,
|
|
19
|
-
base_path: Path = Path.home() / ".flowfile/.tmp",
|
|
20
|
-
max_retries: int = 3,
|
|
21
|
-
retry_delay: float = 0.5):
|
|
22
|
-
logger.info(f"Initializing DuckDBCacheManager with base_path: {base_path}")
|
|
23
|
-
|
|
24
|
-
# Shared resources path
|
|
25
|
-
self.base_path = base_path
|
|
26
|
-
self.base_path.mkdir(parents=True, exist_ok=True)
|
|
27
|
-
|
|
28
|
-
# Central coordination directory
|
|
29
|
-
self.coordinator_dir = self.base_path / "coordinator"
|
|
30
|
-
self.coordinator_dir.mkdir(exist_ok=True)
|
|
31
|
-
|
|
32
|
-
# Process-specific information
|
|
33
|
-
self.process_id = os.getpid()
|
|
34
|
-
self.max_retries = max_retries
|
|
35
|
-
self.retry_delay = retry_delay
|
|
36
|
-
|
|
37
|
-
# Global lock for cache allocation
|
|
38
|
-
self.global_lock_file = self.coordinator_dir / "global.lock"
|
|
39
|
-
logger.debug(f"Process {self.process_id} initialized with global lock at {self.global_lock_file}")
|
|
40
|
-
|
|
41
|
-
# Register cleanup
|
|
42
|
-
atexit.register(self.cleanup_process_resources)
|
|
43
|
-
|
|
44
|
-
@contextmanager
|
|
45
|
-
def get_cache(self) -> 'DuckDBCache':
|
|
46
|
-
"""
|
|
47
|
-
Get an available cache instance with proper locking.
|
|
48
|
-
"""
|
|
49
|
-
logger.debug(f"Process {self.process_id} attempting to get cache")
|
|
50
|
-
cache_id = None
|
|
51
|
-
cache = None
|
|
52
|
-
|
|
53
|
-
# First, try to get any available existing cache
|
|
54
|
-
with filelock.FileLock(str(self.global_lock_file), timeout=0.1):
|
|
55
|
-
logger.debug(f"Process {self.process_id} acquired global lock")
|
|
56
|
-
for i in range(10): # Check first 10 possible cache slots
|
|
57
|
-
try:
|
|
58
|
-
cache_lock = filelock.FileLock(
|
|
59
|
-
str(self.coordinator_dir / f"cache_{i}.lock"),
|
|
60
|
-
timeout=0.1
|
|
61
|
-
)
|
|
62
|
-
cache_lock.acquire()
|
|
63
|
-
|
|
64
|
-
# If we got the lock, use this cache slot
|
|
65
|
-
cache_id = i
|
|
66
|
-
cache_path = self.base_path / f"airbyte_cache_{i}"
|
|
67
|
-
logger.info(f"Process {self.process_id} acquired cache slot {i} at {cache_path}")
|
|
68
|
-
|
|
69
|
-
cache = DuckDBCache(
|
|
70
|
-
db_path=cache_path,
|
|
71
|
-
schema_name="main"
|
|
72
|
-
)
|
|
73
|
-
|
|
74
|
-
# Keep track of which process is using this cache
|
|
75
|
-
with open(self.coordinator_dir / f"cache_{i}.pid", 'w') as f:
|
|
76
|
-
f.write(str(self.process_id))
|
|
77
|
-
|
|
78
|
-
break
|
|
79
|
-
|
|
80
|
-
except filelock.Timeout:
|
|
81
|
-
logger.debug(f"Process {self.process_id} failed to acquire lock for cache slot {i}")
|
|
82
|
-
continue
|
|
83
|
-
|
|
84
|
-
if cache is None:
|
|
85
|
-
logger.info(f"Process {self.process_id} couldn't acquire existing cache, creating new one")
|
|
86
|
-
# If no existing cache is available, create a new one
|
|
87
|
-
cache_id = self._create_new_cache_slot()
|
|
88
|
-
cache_path = self.base_path / f"airbyte_cache_{cache_id}"
|
|
89
|
-
cache = DuckDBCache(
|
|
90
|
-
db_path=cache_path,
|
|
91
|
-
schema_name="main"
|
|
92
|
-
)
|
|
93
|
-
|
|
94
|
-
try:
|
|
95
|
-
yield cache
|
|
96
|
-
finally:
|
|
97
|
-
# Cleanup
|
|
98
|
-
if cache_id is not None:
|
|
99
|
-
lock_file = self.coordinator_dir / f"cache_{cache_id}.lock"
|
|
100
|
-
pid_file = self.coordinator_dir / f"cache_{cache_id}.pid"
|
|
101
|
-
logger.debug(f"Process {self.process_id} cleaning up cache slot {cache_id}")
|
|
102
|
-
|
|
103
|
-
try:
|
|
104
|
-
if lock_file.exists():
|
|
105
|
-
lock_file.unlink()
|
|
106
|
-
if pid_file.exists():
|
|
107
|
-
pid_file.unlink()
|
|
108
|
-
except OSError as e:
|
|
109
|
-
logger.error(f"Process {self.process_id} failed to cleanup files for cache {cache_id}: {str(e)}")
|
|
110
|
-
|
|
111
|
-
def _create_new_cache_slot(self) -> int:
|
|
112
|
-
"""Create a new cache slot with proper locking."""
|
|
113
|
-
logger.debug(f"Process {self.process_id} creating new cache slot")
|
|
114
|
-
with filelock.FileLock(str(self.global_lock_file), timeout=10):
|
|
115
|
-
# Find the next available slot
|
|
116
|
-
existing_caches = set()
|
|
117
|
-
for cache_file in self.coordinator_dir.glob("cache_*.lock"):
|
|
118
|
-
try:
|
|
119
|
-
slot = int(cache_file.stem.split('_')[1])
|
|
120
|
-
existing_caches.add(slot)
|
|
121
|
-
except (ValueError, IndexError):
|
|
122
|
-
continue
|
|
123
|
-
|
|
124
|
-
# Get the first available slot number
|
|
125
|
-
new_slot = 0
|
|
126
|
-
while new_slot in existing_caches:
|
|
127
|
-
new_slot += 1
|
|
128
|
-
|
|
129
|
-
# Create the lock file for this slot
|
|
130
|
-
lock_file = self.coordinator_dir / f"cache_{new_slot}.lock"
|
|
131
|
-
lock_file.touch()
|
|
132
|
-
logger.info(f"Process {self.process_id} created new cache slot {new_slot}")
|
|
133
|
-
|
|
134
|
-
return new_slot
|
|
135
|
-
|
|
136
|
-
def cleanup_process_resources(self):
|
|
137
|
-
"""Clean up resources when the process exits."""
|
|
138
|
-
logger.debug(f"Process {self.process_id} starting cleanup")
|
|
139
|
-
# Clean up any cache slots owned by this process
|
|
140
|
-
for pid_file in self.coordinator_dir.glob("*.pid"):
|
|
141
|
-
try:
|
|
142
|
-
with open(pid_file, 'r') as f:
|
|
143
|
-
pid = int(f.read().strip())
|
|
144
|
-
|
|
145
|
-
if pid == self.process_id:
|
|
146
|
-
cache_id = pid_file.stem.split('_')[1]
|
|
147
|
-
logger.info(f"Process {self.process_id} cleaning up cache slot {cache_id}")
|
|
148
|
-
|
|
149
|
-
# Remove the cache files
|
|
150
|
-
cache_path = self.base_path / f"airbyte_cache_{cache_id}"
|
|
151
|
-
lock_file = self.coordinator_dir / f"cache_{cache_id}.lock"
|
|
152
|
-
|
|
153
|
-
for file in [pid_file, lock_file, cache_path]:
|
|
154
|
-
try:
|
|
155
|
-
if file.exists():
|
|
156
|
-
file.unlink()
|
|
157
|
-
logger.debug(f"Process {self.process_id} removed file: {file}")
|
|
158
|
-
except OSError as e:
|
|
159
|
-
logger.error(f"Process {self.process_id} failed to remove file {file}: {str(e)}")
|
|
160
|
-
except (ValueError, OSError) as e:
|
|
161
|
-
logger.error(f"Process {self.process_id} encountered error during cleanup: {str(e)}")
|
|
flowfile_worker/external_sources/airbyte_sources/main.py
DELETED
|
@@ -1,89 +0,0 @@
|
|
|
1
|
-
from ast import literal_eval
|
|
2
|
-
import polars as pl
|
|
3
|
-
from typing import Optional
|
|
4
|
-
import airbyte as ab
|
|
5
|
-
import os
|
|
6
|
-
from airbyte import exceptions as exc
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
from flowfile_worker.configs import logger
|
|
10
|
-
from flowfile_worker.external_sources.airbyte_sources.models import AirbyteSettings
|
|
11
|
-
from flowfile_worker.external_sources.airbyte_sources.cache_manager import DuckDBCacheManager
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class AirbyteGetter:
|
|
15
|
-
stream: str
|
|
16
|
-
source_name: str
|
|
17
|
-
_type: str
|
|
18
|
-
_airbyte_module = None
|
|
19
|
-
_enforce_full_refresh: Optional[bool] = True
|
|
20
|
-
version: Optional[str] = None
|
|
21
|
-
|
|
22
|
-
def __init__(self, airbyte_settings: AirbyteSettings):
|
|
23
|
-
self._airbyte_response = None
|
|
24
|
-
self.stream = airbyte_settings.stream
|
|
25
|
-
self.source_name = airbyte_settings.source_name
|
|
26
|
-
self._enforce_full_refresh = airbyte_settings.enforce_full_refresh
|
|
27
|
-
self.cache_manager = DuckDBCacheManager()
|
|
28
|
-
if hasattr(airbyte_settings, 'version'):
|
|
29
|
-
self.version = airbyte_settings.version
|
|
30
|
-
|
|
31
|
-
# Handle config
|
|
32
|
-
if airbyte_settings.config_ref and not airbyte_settings.config:
|
|
33
|
-
logger.info(f"Getting config from {airbyte_settings.config_ref}")
|
|
34
|
-
config = literal_eval(os.environ.get(airbyte_settings.config_ref))
|
|
35
|
-
else:
|
|
36
|
-
logger.info(f"Using provided config")
|
|
37
|
-
config = airbyte_settings.config
|
|
38
|
-
|
|
39
|
-
if config is None:
|
|
40
|
-
raise ValueError("Config must be provided")
|
|
41
|
-
|
|
42
|
-
self.config = config
|
|
43
|
-
self._type = 'airbyte'
|
|
44
|
-
self.read_result = None
|
|
45
|
-
|
|
46
|
-
def __call__(self) -> pl.DataFrame:
|
|
47
|
-
with self.cache_manager.get_cache() as cache:
|
|
48
|
-
if self.read_result is None:
|
|
49
|
-
# Lazy import airbyte
|
|
50
|
-
|
|
51
|
-
try:
|
|
52
|
-
source = ab.get_source(
|
|
53
|
-
name=self.source_name,
|
|
54
|
-
config=self.config,
|
|
55
|
-
streams=self.stream,
|
|
56
|
-
docker_image=True,
|
|
57
|
-
version=self.version
|
|
58
|
-
)
|
|
59
|
-
|
|
60
|
-
logger.debug(f'Starting to load data for {self.stream}') # Changed to debug level
|
|
61
|
-
|
|
62
|
-
self.read_result = source.read(
|
|
63
|
-
cache=cache,
|
|
64
|
-
force_full_refresh=self._enforce_full_refresh
|
|
65
|
-
)
|
|
66
|
-
|
|
67
|
-
except Exception as e:
|
|
68
|
-
logger.error(f"Error during source operation: {str(e)}")
|
|
69
|
-
raise
|
|
70
|
-
|
|
71
|
-
df = self.read_result[self.stream].to_pandas()
|
|
72
|
-
drop_cols = [c for c in df.columns if c.startswith('_airbyte')]
|
|
73
|
-
df.drop(drop_cols, axis=1, inplace=True)
|
|
74
|
-
return pl.from_pandas(df)
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
def read_airbyte_source(airbyte_settings: AirbyteSettings) -> pl.DataFrame:
|
|
78
|
-
"""
|
|
79
|
-
Read data from an Airbyte source and return it as a Polars DataFrame.
|
|
80
|
-
Args:
|
|
81
|
-
airbyte_settings (): The settings for the Airbyte source.
|
|
82
|
-
|
|
83
|
-
Returns: The data as a Polars DataFrame.
|
|
84
|
-
"""
|
|
85
|
-
airbyte_getter = AirbyteGetter(airbyte_settings)
|
|
86
|
-
logger.info('Getting data from Airbyte')
|
|
87
|
-
data = airbyte_getter()
|
|
88
|
-
logger.info('Data retrieved from Airbyte')
|
|
89
|
-
return data
|
|
flowfile_worker/external_sources/airbyte_sources/models.py
DELETED
|
@@ -1,133 +0,0 @@
|
|
|
1
|
-
from typing import Any, Dict, List, Optional, TYPE_CHECKING, Union
|
|
2
|
-
from pydantic import BaseModel, field_validator, ConfigDict
|
|
3
|
-
from flowfile_worker.configs import logger
|
|
4
|
-
|
|
5
|
-
# Use TYPE_CHECKING to avoid circular imports
|
|
6
|
-
if TYPE_CHECKING:
|
|
7
|
-
from airbyte import Source
|
|
8
|
-
else:
|
|
9
|
-
Source = Any
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class LazyAirbyteSource:
|
|
13
|
-
"""Lazy wrapper for airbyte Source class."""
|
|
14
|
-
_source_class = None
|
|
15
|
-
|
|
16
|
-
@classmethod
|
|
17
|
-
def get_source_class(cls):
|
|
18
|
-
if cls._source_class is None:
|
|
19
|
-
logger.info("Importing airbyte Source class")
|
|
20
|
-
from airbyte import Source
|
|
21
|
-
cls._source_class = Source
|
|
22
|
-
return cls._source_class
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
class JsonSchema(BaseModel):
|
|
26
|
-
type: Optional[Union[str, List[str]]]
|
|
27
|
-
airbyte_type: Optional[Union[str, List[str]]] = None
|
|
28
|
-
format: Optional[str] = None
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
class AirbyteProperty(BaseModel):
|
|
32
|
-
name: str
|
|
33
|
-
json_schema: JsonSchema
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
class AirbyteResponse(BaseModel):
|
|
37
|
-
source: Any # Using Any to avoid direct Source import
|
|
38
|
-
properties: list[AirbyteProperty]
|
|
39
|
-
|
|
40
|
-
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
41
|
-
|
|
42
|
-
@field_validator('source')
|
|
43
|
-
@classmethod
|
|
44
|
-
def validate_source(cls, v: Any) -> Any:
|
|
45
|
-
source_class = LazyAirbyteSource.get_source_class()
|
|
46
|
-
if not isinstance(v, source_class):
|
|
47
|
-
raise ValueError(f"Source must be an instance of airbyte.Source, got {type(v)}")
|
|
48
|
-
return v
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
class GenericProperties(BaseModel):
|
|
52
|
-
type: str
|
|
53
|
-
title: Optional[str] = None
|
|
54
|
-
description: Optional[str] = None
|
|
55
|
-
order: Optional[int] = None
|
|
56
|
-
required: Optional[List[str]] = None
|
|
57
|
-
airbyte_secret: Optional[bool] = None
|
|
58
|
-
pattern: Optional[str] = None
|
|
59
|
-
pattern_descriptor: Optional[str] = None
|
|
60
|
-
format: Optional[str] = None
|
|
61
|
-
examples: Optional[List[Any]] = None
|
|
62
|
-
enum: Optional[List[str]] = None
|
|
63
|
-
minimum: Optional[float] = None
|
|
64
|
-
maximum: Optional[float] = None
|
|
65
|
-
items: Optional[Any] = None
|
|
66
|
-
properties: Optional[Dict[str, Any]] = None
|
|
67
|
-
|
|
68
|
-
@field_validator('items', 'properties')
|
|
69
|
-
@classmethod
|
|
70
|
-
def validate_nested(cls, value: Any) -> Any:
|
|
71
|
-
if isinstance(value, dict):
|
|
72
|
-
if 'type' in value:
|
|
73
|
-
return GenericProperties(**value)
|
|
74
|
-
return {k: GenericProperties(**v) if isinstance(v, dict) else v for k, v in value.items()}
|
|
75
|
-
return value
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
class GenericSchema(BaseModel):
|
|
79
|
-
title: str
|
|
80
|
-
type: str
|
|
81
|
-
required: Optional[List[str]] = None
|
|
82
|
-
additionalProperties: Optional[bool] = None
|
|
83
|
-
properties: Dict[str, GenericProperties]
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
class FieldProperty(BaseModel):
|
|
87
|
-
title: Optional[str] = None
|
|
88
|
-
type: str
|
|
89
|
-
key: str
|
|
90
|
-
description: Optional[str] = None
|
|
91
|
-
airbyte_secret: Optional[bool] = None
|
|
92
|
-
input_value: Optional[str] = None
|
|
93
|
-
default: Any
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
class OverallFieldProperty(BaseModel):
|
|
97
|
-
title: Optional[str] = None
|
|
98
|
-
type: str
|
|
99
|
-
key: str
|
|
100
|
-
required: bool
|
|
101
|
-
properties: List[FieldProperty]
|
|
102
|
-
items: Optional[List[FieldProperty]]
|
|
103
|
-
isOpen: bool
|
|
104
|
-
description: Optional[str] = None
|
|
105
|
-
input_value: Optional[str] = None
|
|
106
|
-
airbyte_secret: Optional[bool] = None
|
|
107
|
-
default: Any
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
class AirbyteConfigTemplate(BaseModel):
|
|
111
|
-
source_name: str
|
|
112
|
-
docs_url: Optional[str] = None
|
|
113
|
-
config_spec: Dict
|
|
114
|
-
available_streams: Optional[List[str]] = None
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
class AirbyteSettings(BaseModel):
|
|
118
|
-
|
|
119
|
-
source_name: str
|
|
120
|
-
stream: str
|
|
121
|
-
config_ref: Optional[str] = None
|
|
122
|
-
config: Optional[Dict] = None
|
|
123
|
-
fields: Optional[List] = None
|
|
124
|
-
enforce_full_refresh: Optional[bool] = True
|
|
125
|
-
flowfile_flow_id: int = 1
|
|
126
|
-
flowfile_node_id: int | str = -1
|
|
127
|
-
version: Optional[str] = None
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
def get_source_instance(*args, **kwargs) -> 'Source':
|
|
131
|
-
"""Helper function to get a Source instance with lazy loading."""
|
|
132
|
-
source_class = LazyAirbyteSource.get_source_class()
|
|
133
|
-
return source_class(*args, **kwargs)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{flowfile_core/schemas/external_sources → flowfile_worker/external_sources/s3_source}/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|