Flowfile 0.3.5__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of Flowfile might be problematic. Click here for more details.
- flowfile/__init__.py +3 -3
- flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
- flowfile/web/static/assets/CloudConnectionManager-d004942f.js +784 -0
- flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
- flowfile/web/static/assets/CloudStorageReader-eccf9fc2.js +437 -0
- flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
- flowfile/web/static/assets/CloudStorageWriter-b1ba6bba.js +430 -0
- flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-68981877.js} +8 -8
- flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-0b06649c.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-8349a426.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-905344f8.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-9f5b8638.js} +9 -9
- flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-131a6d53.js} +5 -5
- flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-e3549dcc.js} +6 -6
- flowfile/web/static/assets/{Filter-f87bb897.js → Filter-6e0730ae.js} +8 -8
- flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-02f033e6.js} +75 -9
- flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
- flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-54c14036.js} +9 -9
- flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-08a3f499.js} +5 -5
- flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-2ae38139.js} +6 -6
- flowfile/web/static/assets/{Join-eec38203.js → Join-493b9772.js} +23 -15
- flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
- flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-4373d163.js} +106 -34
- flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
- flowfile/web/static/assets/{Output-3b2ca045.js → Output-b534f3c7.js} +4 -4
- flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-2968ff65.js} +6 -6
- flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-65136536.js} +6 -6
- flowfile/web/static/assets/{Read-07acdc9a.js → Read-c56339ed.js} +6 -6
- flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-1c641a5e.js} +5 -5
- flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-df308b8f.js} +6 -6
- flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-293e8a64.js} +5 -5
- flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-03911655.js} +2 -2
- flowfile/web/static/assets/{Select-32b28406.js → Select-3058a13d.js} +8 -8
- flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-fbf4fb39.js} +1 -1
- flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-a29bbaf7.js} +6 -6
- flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-c7d7760e.js} +8 -8
- flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-118f1d20.js} +2 -2
- flowfile/web/static/assets/{Union-39eecc6c.js → Union-f0589571.js} +5 -5
- flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-7329a207.js} +8 -8
- flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-30b0be15.js} +5 -5
- flowfile/web/static/assets/{api-44ca9e9c.js → api-602fb95c.js} +1 -1
- flowfile/web/static/assets/api-fb67319c.js +80 -0
- flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
- flowfile/web/static/assets/{designer-267d44f1.js → designer-94a6bf4d.js} +36 -34
- flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-a224831e.js} +1 -1
- flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-c2d2aa97.js} +1 -1
- flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-921ac5fd.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-7013cc94.js} +3 -3
- flowfile/web/static/assets/{index-e235a8bc.js → index-3a75211d.js} +19 -6
- flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-a63d4680.js} +3 -3
- flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-763aec6e.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-08464729.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-f15a5f87.js} +2 -1
- flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-93bd09d7.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/METADATA +8 -3
- {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/RECORD +108 -103
- {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/entry_points.txt +2 -0
- flowfile_core/__init__.py +2 -0
- flowfile_core/configs/node_store/nodes.py +8 -6
- flowfile_core/database/connection.py +63 -15
- flowfile_core/database/init_db.py +0 -1
- flowfile_core/database/models.py +49 -2
- flowfile_core/flowfile/code_generator/code_generator.py +401 -17
- flowfile_core/flowfile/connection_manager/models.py +1 -1
- flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
- flowfile_core/flowfile/extensions.py +1 -1
- flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +522 -59
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
- flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
- flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
- flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
- flowfile_core/flowfile/flow_graph.py +119 -82
- flowfile_core/flowfile/flow_node/flow_node.py +68 -33
- flowfile_core/flowfile/flow_node/models.py +32 -3
- flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
- flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
- flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
- flowfile_core/flowfile/utils.py +1 -23
- flowfile_core/main.py +3 -2
- flowfile_core/routes/cloud_connections.py +81 -0
- flowfile_core/routes/logs.py +0 -1
- flowfile_core/routes/routes.py +3 -39
- flowfile_core/schemas/cloud_storage_schemas.py +215 -0
- flowfile_core/schemas/input_schema.py +37 -15
- flowfile_core/schemas/schemas.py +7 -2
- flowfile_core/schemas/transform_schema.py +97 -22
- flowfile_core/utils/utils.py +40 -1
- flowfile_core/utils/validate_setup.py +41 -0
- flowfile_frame/flow_frame.py +253 -102
- flowfile_frame/flow_frame_methods.py +13 -13
- flowfile_worker/external_sources/s3_source/main.py +216 -0
- flowfile_worker/external_sources/s3_source/models.py +142 -0
- flowfile_worker/funcs.py +51 -6
- flowfile_worker/models.py +22 -2
- flowfile_worker/routes.py +40 -38
- flowfile_worker/utils.py +1 -1
- test_utils/s3/commands.py +46 -0
- test_utils/s3/data_generator.py +291 -0
- test_utils/s3/fixtures.py +209 -0
- flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
- flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
- flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
- flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
- flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
- flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
- flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
- flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
- flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
- flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
- {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/LICENSE +0 -0
- {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/WHEEL +0 -0
- {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
- {flowfile_core/schemas/external_sources → test_utils/s3}/__init__.py +0 -0
flowfile_core/database/connection.py
CHANGED
|
@@ -1,33 +1,69 @@
|
|
|
1
|
-
|
|
2
1
|
from sqlalchemy import create_engine
|
|
3
2
|
from contextlib import contextmanager
|
|
4
3
|
from sqlalchemy.orm import sessionmaker
|
|
5
4
|
import os
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from flowfile_core.configs import logger
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def get_app_data_dir() -> Path:
|
|
11
|
+
"""Get the appropriate application data directory for the current platform."""
|
|
12
|
+
app_name = "Flowfile"
|
|
13
|
+
|
|
14
|
+
if sys.platform == "win32":
|
|
15
|
+
# Windows: C:\Users\{username}\AppData\Local\flowfile
|
|
16
|
+
base_dir = os.environ.get("LOCALAPPDATA")
|
|
17
|
+
if not base_dir:
|
|
18
|
+
base_dir = os.path.join(os.path.expanduser("~"), "AppData", "Local")
|
|
19
|
+
elif sys.platform == "darwin":
|
|
20
|
+
# macOS: ~/Library/Application Support/flowfile
|
|
21
|
+
base_dir = os.path.join(os.path.expanduser("~"), "Library", "Application Support")
|
|
22
|
+
else:
|
|
23
|
+
# Linux: ~/.local/share/flowfile or use XDG_DATA_HOME
|
|
24
|
+
base_dir = os.environ.get("XDG_DATA_HOME")
|
|
25
|
+
if not base_dir:
|
|
26
|
+
base_dir = os.path.join(os.path.expanduser("~"), ".local", "share")
|
|
27
|
+
|
|
28
|
+
app_dir = Path(base_dir) / app_name
|
|
29
|
+
|
|
30
|
+
print(f"Using application data directory: {app_dir}")
|
|
31
|
+
app_dir.mkdir(parents=True, exist_ok=True)
|
|
32
|
+
|
|
33
|
+
return app_dir
|
|
6
34
|
|
|
7
35
|
|
|
8
36
|
def get_database_url():
|
|
37
|
+
"""Get the database URL based on the current environment."""
|
|
9
38
|
if os.environ.get("TESTING") == "True":
|
|
10
|
-
# Use a
|
|
39
|
+
# Use a temporary test database
|
|
11
40
|
test_db_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../test_flowfile.db")
|
|
12
41
|
return f"sqlite:///{test_db_path}"
|
|
13
|
-
# elif os.environ.get("FLOWFILE_MODE") == "electron":
|
|
14
42
|
|
|
15
|
-
|
|
43
|
+
custom_db_path = os.environ.get("FLOWFILE_DB_PATH")
|
|
44
|
+
if custom_db_path:
|
|
45
|
+
# logger.error("Using database URL:", os.environ.get("FLOWFILE_DB_URL"))
|
|
46
|
+
return f"sqlite:///{custom_db_path}"
|
|
47
|
+
# Use centralized location
|
|
48
|
+
app_dir = get_app_data_dir()
|
|
49
|
+
|
|
50
|
+
db_path = app_dir / "flowfile.db"
|
|
51
|
+
logger.info(f"Using database URL: sqlite:///{db_path}")
|
|
52
|
+
return f"sqlite:///{db_path}"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def get_database_path() -> Path:
|
|
56
|
+
"""Get the actual path to the database file (useful for backup/info purposes)."""
|
|
57
|
+
url = get_database_url()
|
|
58
|
+
if url.startswith("sqlite:///"):
|
|
59
|
+
return Path(url.replace("sqlite:///", ""))
|
|
60
|
+
return None
|
|
16
61
|
|
|
17
|
-
# else:
|
|
18
|
-
# # Use PostgreSQL for Docker mode
|
|
19
|
-
# host = os.environ.get("DB_HOST", "localhost")
|
|
20
|
-
# port = os.environ.get("DB_PORT", "5432")
|
|
21
|
-
# user = os.environ.get("DB_USER", "postgres")
|
|
22
|
-
# password = os.environ.get("DB_PASSWORD", "postgres")
|
|
23
|
-
# db = os.environ.get("DB_NAME", "flowfile")
|
|
24
|
-
# return f"postgresql://{user}:{password}@{host}:{port}/{db}"
|
|
25
|
-
#
|
|
26
62
|
|
|
27
63
|
# Create database engine
|
|
28
64
|
engine = create_engine(
|
|
29
65
|
get_database_url(),
|
|
30
|
-
connect_args={"check_same_thread": False} if
|
|
66
|
+
connect_args={"check_same_thread": False} if "sqlite" in get_database_url() else {}
|
|
31
67
|
)
|
|
32
68
|
|
|
33
69
|
# Create session factory
|
|
@@ -35,6 +71,7 @@ SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
|
|
35
71
|
|
|
36
72
|
|
|
37
73
|
def get_db():
|
|
74
|
+
"""Dependency for FastAPI to get database session."""
|
|
38
75
|
db = SessionLocal()
|
|
39
76
|
try:
|
|
40
77
|
yield db
|
|
@@ -44,8 +81,19 @@ def get_db():
|
|
|
44
81
|
|
|
45
82
|
@contextmanager
|
|
46
83
|
def get_db_context():
|
|
84
|
+
"""Context manager for getting database session."""
|
|
47
85
|
db = SessionLocal()
|
|
48
86
|
try:
|
|
49
87
|
yield db
|
|
50
88
|
finally:
|
|
51
|
-
db.close()
|
|
89
|
+
db.close()
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def get_database_info():
|
|
93
|
+
"""Get information about the current database configuration."""
|
|
94
|
+
return {
|
|
95
|
+
"url": get_database_url(),
|
|
96
|
+
"path": str(get_database_path()) if get_database_path() else None,
|
|
97
|
+
"app_data_dir": str(get_app_data_dir()),
|
|
98
|
+
"platform": sys.platform
|
|
99
|
+
}
|
flowfile_core/database/init_db.py
CHANGED
|
@@ -13,7 +13,6 @@ db_models.Base.metadata.create_all(bind=engine)
|
|
|
13
13
|
|
|
14
14
|
def create_default_local_user(db: Session):
|
|
15
15
|
local_user = db.query(db_models.User).filter(db_models.User.username == "local_user").first()
|
|
16
|
-
|
|
17
16
|
if not local_user:
|
|
18
17
|
random_password = ''.join(secrets.choice(string.ascii_letters + string.digits) for _ in range(32))
|
|
19
18
|
hashed_password = pwd_context.hash(random_password)
|
flowfile_core/database/models.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
|
|
2
|
-
from sqlalchemy import Boolean, Column, ForeignKey, Integer, String, Text
|
|
1
|
+
from sqlalchemy import Boolean, Column, ForeignKey, Integer, String, Text, DateTime
|
|
3
2
|
from sqlalchemy.ext.declarative import declarative_base
|
|
3
|
+
from sqlalchemy.sql import func
|
|
4
4
|
|
|
5
5
|
Base = declarative_base()
|
|
6
6
|
|
|
@@ -39,3 +39,50 @@ class DatabaseConnection(Base):
|
|
|
39
39
|
ssl_enabled = Column(Boolean, default=False)
|
|
40
40
|
password_id = Column(Integer, ForeignKey("secrets.id"))
|
|
41
41
|
user_id = Column(Integer, ForeignKey("users.id"))
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class CloudStorageConnection(Base):
|
|
45
|
+
__tablename__ = "cloud_storage_connections"
|
|
46
|
+
|
|
47
|
+
id = Column(Integer, primary_key=True, index=True)
|
|
48
|
+
connection_name = Column(String, index=True, nullable=False)
|
|
49
|
+
storage_type = Column(String, nullable=False) # 's3', 'adls', 'gcs'
|
|
50
|
+
auth_method = Column(String, nullable=False) # 'access_key', 'iam_role', etc.
|
|
51
|
+
|
|
52
|
+
# AWS S3 fields
|
|
53
|
+
aws_region = Column(String, nullable=True)
|
|
54
|
+
aws_access_key_id = Column(String, nullable=True)
|
|
55
|
+
aws_secret_access_key_id = Column(Integer, ForeignKey("secrets.id"), nullable=True)
|
|
56
|
+
aws_session_token_id = Column(Integer, ForeignKey("secrets.id"), nullable=True)
|
|
57
|
+
aws_role_arn = Column(String, nullable=True)
|
|
58
|
+
aws_allow_unsafe_html = Column(Boolean, nullable=True)
|
|
59
|
+
|
|
60
|
+
# Azure ADLS fields
|
|
61
|
+
azure_account_name = Column(String, nullable=True)
|
|
62
|
+
azure_account_key_id = Column(Integer, ForeignKey("secrets.id"), nullable=True)
|
|
63
|
+
azure_tenant_id = Column(String, nullable=True)
|
|
64
|
+
azure_client_id = Column(String, nullable=True)
|
|
65
|
+
azure_client_secret_id = Column(Integer, ForeignKey("secrets.id"), nullable=True)
|
|
66
|
+
azure_sas_token_id = Column(Integer, ForeignKey("secrets.id"), nullable=True)
|
|
67
|
+
|
|
68
|
+
# Common fields
|
|
69
|
+
endpoint_url = Column(String, nullable=True)
|
|
70
|
+
extra_config = Column(Text, nullable=True) # JSON field for additional config
|
|
71
|
+
verify_ssl = Column(Boolean, default=True)
|
|
72
|
+
|
|
73
|
+
# Metadata
|
|
74
|
+
user_id = Column(Integer, ForeignKey("users.id"), nullable=False)
|
|
75
|
+
created_at = Column(DateTime, default=func.now(), nullable=False)
|
|
76
|
+
updated_at = Column(DateTime, default=func.now(), onupdate=func.now(), nullable=False)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class CloudStoragePermission(Base):
|
|
80
|
+
__tablename__ = "cloud_storage_permissions"
|
|
81
|
+
|
|
82
|
+
id = Column(Integer, primary_key=True, index=True)
|
|
83
|
+
connection_id = Column(Integer, ForeignKey("cloud_storage_connections.id"), nullable=False)
|
|
84
|
+
resource_path = Column(String, nullable=False) # e.g., "s3://bucket-name"
|
|
85
|
+
can_read = Column(Boolean, default=True)
|
|
86
|
+
can_write = Column(Boolean, default=False)
|
|
87
|
+
can_delete = Column(Boolean, default=False)
|
|
88
|
+
can_list = Column(Boolean, default=True)
|
flowfile_core/flowfile/code_generator/code_generator.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
|
-
from typing import List, Dict, Optional, Set, Tuple
|
|
2
|
-
from collections import defaultdict
|
|
1
|
+
from typing import List, Dict, Optional, Set, Tuple
|
|
3
2
|
import polars as pl
|
|
4
3
|
|
|
5
4
|
from flowfile_core.flowfile.flow_graph import FlowGraph
|
|
@@ -176,13 +175,10 @@ class FlowGraphToPolarsConverter:
|
|
|
176
175
|
|
|
177
176
|
def _handle_manual_input(self, settings: input_schema.NodeManualInput, var_name: str, input_vars: Dict[str, str]) -> None:
|
|
178
177
|
"""Handle manual data input nodes."""
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
self._add_code(f"{var_name} = pl.LazyFrame({data}, schema={schema}, strict=False)")
|
|
184
|
-
else:
|
|
185
|
-
self._add_code(f"{var_name} = pl.LazyFrame({settings.raw_data})")
|
|
178
|
+
data = settings.raw_data_format.data
|
|
179
|
+
flowfile_schema = list(FlowfileColumn.create_from_minimal_field_info(c) for c in settings.raw_data_format.columns)
|
|
180
|
+
schema = self.get_manual_schema_input(flowfile_schema)
|
|
181
|
+
self._add_code(f"{var_name} = pl.LazyFrame({data}, schema={schema}, strict=False)")
|
|
186
182
|
self._add_code("")
|
|
187
183
|
|
|
188
184
|
def _handle_filter(self, settings: input_schema.NodeFilter, var_name: str, input_vars: Dict[str, str]) -> None:
|
|
@@ -247,22 +243,410 @@ class FlowGraphToPolarsConverter:
|
|
|
247
243
|
self._add_code("")
|
|
248
244
|
|
|
249
245
|
def _handle_join(self, settings: input_schema.NodeJoin, var_name: str, input_vars: Dict[str, str]) -> None:
|
|
250
|
-
"""Handle join nodes.
|
|
246
|
+
"""Handle join nodes by routing to appropriate join type handler.
|
|
247
|
+
|
|
248
|
+
This is the main entry point for processing join operations. It determines
|
|
249
|
+
the type of join and delegates to the appropriate handler method.
|
|
250
|
+
|
|
251
|
+
Args:
|
|
252
|
+
settings: NodeJoin settings containing join configuration
|
|
253
|
+
var_name: Name of the variable to store the joined DataFrame
|
|
254
|
+
input_vars: Dictionary mapping input names to DataFrame variable names
|
|
255
|
+
|
|
256
|
+
Returns:
|
|
257
|
+
None: Modifies internal state by adding generated code
|
|
258
|
+
"""
|
|
251
259
|
left_df = input_vars.get('main', input_vars.get('main_0', 'df_left'))
|
|
252
260
|
right_df = input_vars.get('right', input_vars.get('main_1', 'df_right'))
|
|
253
261
|
|
|
254
|
-
#
|
|
262
|
+
# Ensure left and right DataFrames are distinct
|
|
263
|
+
if left_df == right_df:
|
|
264
|
+
right_df = "df_right"
|
|
265
|
+
self._add_code(f"{right_df} = {left_df}")
|
|
266
|
+
|
|
267
|
+
if settings.join_input.how in ("semi", "anti"):
|
|
268
|
+
self._handle_semi_anti_join(settings, var_name, left_df, right_df)
|
|
269
|
+
else:
|
|
270
|
+
self._handle_standard_join(settings, var_name, left_df, right_df)
|
|
271
|
+
|
|
272
|
+
def _handle_semi_anti_join(self, settings: input_schema.NodeJoin, var_name: str, left_df: str,
|
|
273
|
+
right_df: str) -> None:
|
|
274
|
+
"""Handle semi and anti joins which only return rows from the left DataFrame.
|
|
275
|
+
|
|
276
|
+
Semi joins return rows from left DataFrame that have matches in right.
|
|
277
|
+
Anti joins return rows from left DataFrame that have no matches in right.
|
|
278
|
+
These joins are simpler as they don't require column management from right DataFrame.
|
|
279
|
+
|
|
280
|
+
Args:
|
|
281
|
+
settings: NodeJoin settings containing join configuration
|
|
282
|
+
var_name: Name of the variable to store the result
|
|
283
|
+
left_df: Variable name of the left DataFrame
|
|
284
|
+
right_df: Variable name of the right DataFrame
|
|
285
|
+
|
|
286
|
+
Returns:
|
|
287
|
+
None: Modifies internal state by adding generated code
|
|
288
|
+
"""
|
|
255
289
|
left_on = [jm.left_col for jm in settings.join_input.join_mapping]
|
|
256
290
|
right_on = [jm.right_col for jm in settings.join_input.join_mapping]
|
|
257
291
|
|
|
258
|
-
self._add_code(f"{var_name} = {left_df}.join(")
|
|
259
|
-
self._add_code(f"
|
|
260
|
-
self._add_code(f"
|
|
261
|
-
self._add_code(f"
|
|
262
|
-
self._add_code(f'
|
|
292
|
+
self._add_code(f"{var_name} = ({left_df}.join(")
|
|
293
|
+
self._add_code(f" {right_df},")
|
|
294
|
+
self._add_code(f" left_on={left_on},")
|
|
295
|
+
self._add_code(f" right_on={right_on},")
|
|
296
|
+
self._add_code(f' how="{settings.join_input.how}"')
|
|
297
|
+
self._add_code(" )")
|
|
263
298
|
self._add_code(")")
|
|
264
|
-
self._add_code("")
|
|
265
299
|
|
|
300
|
+
def _handle_standard_join(self, settings: input_schema.NodeJoin, var_name: str, left_df: str,
|
|
301
|
+
right_df: str) -> None:
|
|
302
|
+
"""Handle standard joins (left, right, inner, outer) with full column management.
|
|
303
|
+
|
|
304
|
+
Standard joins may include columns from both DataFrames and require careful
|
|
305
|
+
management of column names, duplicates, and transformations. This method
|
|
306
|
+
orchestrates the complete join process including pre/post transformations.
|
|
307
|
+
|
|
308
|
+
Process:
|
|
309
|
+
1. Auto-rename columns to avoid conflicts
|
|
310
|
+
2. Extract join keys
|
|
311
|
+
3. Apply pre-join transformations (renames, drops)
|
|
312
|
+
4. Handle join-specific key transformations
|
|
313
|
+
5. Execute join with post-processing
|
|
314
|
+
|
|
315
|
+
Args:
|
|
316
|
+
settings: NodeJoin settings containing join configuration
|
|
317
|
+
var_name: Name of the variable to store the result
|
|
318
|
+
left_df: Variable name of the left DataFrame
|
|
319
|
+
right_df: Variable name of the right DataFrame
|
|
320
|
+
|
|
321
|
+
Returns:
|
|
322
|
+
None: Modifies internal state by adding generated code
|
|
323
|
+
"""
|
|
324
|
+
settings.join_input.auto_rename()
|
|
325
|
+
|
|
326
|
+
# Get join keys
|
|
327
|
+
left_on, right_on = self._get_join_keys(settings)
|
|
328
|
+
|
|
329
|
+
# Apply pre-join transformations
|
|
330
|
+
left_df, right_df = self._apply_pre_join_transformations(settings, left_df, right_df)
|
|
331
|
+
|
|
332
|
+
# Handle join-specific key transformations
|
|
333
|
+
left_on, right_on, reverse_action, after_join_drop_cols = self._handle_join_key_transformations(
|
|
334
|
+
settings, left_df, right_df, left_on, right_on
|
|
335
|
+
)
|
|
336
|
+
|
|
337
|
+
# Execute the join
|
|
338
|
+
self._execute_join_with_post_processing(
|
|
339
|
+
settings, var_name, left_df, right_df, left_on, right_on,
|
|
340
|
+
after_join_drop_cols, reverse_action
|
|
341
|
+
)
|
|
342
|
+
|
|
343
|
+
def _get_join_keys(self, settings: input_schema.NodeJoin) -> Tuple[List[str], List[str]]:
|
|
344
|
+
"""Extract join keys based on join type.
|
|
345
|
+
|
|
346
|
+
Different join types require different handling of join keys:
|
|
347
|
+
- For outer/right joins: Uses renamed column names for right DataFrame
|
|
348
|
+
- For other joins: Uses original column names from join mapping
|
|
349
|
+
|
|
350
|
+
Args:
|
|
351
|
+
settings: NodeJoin settings containing join configuration
|
|
352
|
+
|
|
353
|
+
Returns:
|
|
354
|
+
Tuple[List[str], List[str]]: Lists of (left_on, right_on) column names
|
|
355
|
+
"""
|
|
356
|
+
left_on = [jm.left_col for jm in settings.join_input.get_names_for_table_rename()]
|
|
357
|
+
|
|
358
|
+
if settings.join_input.how in ("outer", "right"):
|
|
359
|
+
right_on = [jm.right_col for jm in settings.join_input.get_names_for_table_rename()]
|
|
360
|
+
else:
|
|
361
|
+
right_on = [jm.right_col for jm in settings.join_input.join_mapping]
|
|
362
|
+
|
|
363
|
+
return left_on, right_on
|
|
364
|
+
|
|
365
|
+
def _apply_pre_join_transformations(self, settings: input_schema.NodeJoin, left_df: str, right_df: str) -> Tuple[
|
|
366
|
+
str, str]:
|
|
367
|
+
"""Apply column renames and drops before the join operation.
|
|
368
|
+
|
|
369
|
+
Pre-join transformations prepare DataFrames by:
|
|
370
|
+
- Renaming columns according to user specifications
|
|
371
|
+
- Dropping columns marked as not to keep (except join keys)
|
|
372
|
+
- Special handling for right/outer joins where join keys may need preservation
|
|
373
|
+
|
|
374
|
+
Args:
|
|
375
|
+
settings: NodeJoin settings containing column rename/drop specifications
|
|
376
|
+
left_df: Variable name of the left DataFrame
|
|
377
|
+
right_df: Variable name of the right DataFrame
|
|
378
|
+
|
|
379
|
+
Returns:
|
|
380
|
+
Tuple[str, str]: The same DataFrame variable names (left_df, right_df)
|
|
381
|
+
Note: DataFrames are modified via generated code, not new variables
|
|
382
|
+
"""
|
|
383
|
+
# Calculate renames and drops
|
|
384
|
+
right_renames = {
|
|
385
|
+
column.old_name: column.new_name
|
|
386
|
+
for column in settings.join_input.right_select.renames
|
|
387
|
+
if
|
|
388
|
+
column.old_name != column.new_name and not column.join_key or settings.join_input.how in ("outer", "right")
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
left_renames = {
|
|
392
|
+
column.old_name: column.new_name
|
|
393
|
+
for column in settings.join_input.left_select.renames
|
|
394
|
+
if column.old_name != column.new_name
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
left_drop_columns = [
|
|
398
|
+
column.old_name for column in settings.join_input.left_select.renames
|
|
399
|
+
if not column.keep and not column.join_key
|
|
400
|
+
]
|
|
401
|
+
|
|
402
|
+
right_drop_columns = [
|
|
403
|
+
column.old_name for column in settings.join_input.right_select.renames
|
|
404
|
+
if not column.keep and not column.join_key
|
|
405
|
+
]
|
|
406
|
+
|
|
407
|
+
# Apply transformations
|
|
408
|
+
if right_renames:
|
|
409
|
+
self._add_code(f"{right_df} = {right_df}.rename({right_renames})")
|
|
410
|
+
if left_renames:
|
|
411
|
+
self._add_code(f"{left_df} = {left_df}.rename({left_renames})")
|
|
412
|
+
if left_drop_columns:
|
|
413
|
+
self._add_code(f"{left_df} = {left_df}.drop({left_drop_columns})")
|
|
414
|
+
if right_drop_columns:
|
|
415
|
+
self._add_code(f"{right_df} = {right_df}.drop({right_drop_columns})")
|
|
416
|
+
|
|
417
|
+
return left_df, right_df
|
|
418
|
+
|
|
419
|
+
def _handle_join_key_transformations(self, settings: input_schema.NodeJoin, left_df: str, right_df: str,
|
|
420
|
+
left_on: List[str], right_on: List[str]) \
|
|
421
|
+
-> Tuple[List[str], List[str], Optional[Dict], List[str]]:
|
|
422
|
+
"""Route to appropriate join-specific key transformation handler.
|
|
423
|
+
|
|
424
|
+
Different join types require different strategies for handling join keys
|
|
425
|
+
to avoid conflicts and preserve necessary columns.
|
|
426
|
+
|
|
427
|
+
Args:
|
|
428
|
+
settings: NodeJoin settings containing join configuration
|
|
429
|
+
left_df: Variable name of the left DataFrame
|
|
430
|
+
right_df: Variable name of the right DataFrame
|
|
431
|
+
left_on: List of left DataFrame column names to join on
|
|
432
|
+
right_on: List of right DataFrame column names to join on
|
|
433
|
+
|
|
434
|
+
Returns:
|
|
435
|
+
Tuple containing:
|
|
436
|
+
- left_on: Potentially modified list of left join columns
|
|
437
|
+
- right_on: Potentially modified list of right join columns
|
|
438
|
+
- reverse_action: Dictionary for renaming columns after join (or None)
|
|
439
|
+
- after_join_drop_cols: List of columns to drop after join
|
|
440
|
+
"""
|
|
441
|
+
join_type = settings.join_input.how
|
|
442
|
+
|
|
443
|
+
if join_type in ("left", "inner"):
|
|
444
|
+
return self._handle_left_inner_join_keys(settings, right_df, left_on, right_on)
|
|
445
|
+
elif join_type == "right":
|
|
446
|
+
return self._handle_right_join_keys(settings, left_df, left_on, right_on)
|
|
447
|
+
elif join_type == "outer":
|
|
448
|
+
return self._handle_outer_join_keys(settings, right_df, left_on, right_on)
|
|
449
|
+
else:
|
|
450
|
+
return left_on, right_on, None, []
|
|
451
|
+
|
|
452
|
+
def _handle_left_inner_join_keys(self, settings: input_schema.NodeJoin, right_df: str,
|
|
453
|
+
left_on: List[str], right_on: List[str]) -> Tuple[
|
|
454
|
+
List[str], List[str], Dict, List[str]]:
|
|
455
|
+
"""Handle key transformations for left and inner joins.
|
|
456
|
+
|
|
457
|
+
For left/inner joins:
|
|
458
|
+
- Join keys from left DataFrame are preserved
|
|
459
|
+
- Right DataFrame join keys are temporarily renamed with __DROP__ prefix
|
|
460
|
+
- After join, these temporary columns can be renamed back if needed
|
|
461
|
+
|
|
462
|
+
Args:
|
|
463
|
+
settings: NodeJoin settings containing join configuration
|
|
464
|
+
right_df: Variable name of the right DataFrame
|
|
465
|
+
left_on: List of left DataFrame column names to join on
|
|
466
|
+
right_on: List of right DataFrame column names to join on
|
|
467
|
+
|
|
468
|
+
Returns:
|
|
469
|
+
Tuple containing:
|
|
470
|
+
- left_on: Unchanged left join columns
|
|
471
|
+
- right_on: Unchanged right join columns
|
|
472
|
+
- reverse_action: Mapping to rename __DROP__ columns after join
|
|
473
|
+
- after_join_drop_cols: Left join keys marked for dropping
|
|
474
|
+
"""
|
|
475
|
+
left_join_keys_to_keep = [jk.new_name for jk in settings.join_input.left_select.join_key_selects if jk.keep]
|
|
476
|
+
|
|
477
|
+
join_key_duplication_command = [
|
|
478
|
+
f'pl.col("{rjk.old_name}").alias("__DROP__{rjk.new_name}__DROP__")'
|
|
479
|
+
for rjk in settings.join_input.right_select.join_key_selects if rjk.keep
|
|
480
|
+
]
|
|
481
|
+
|
|
482
|
+
reverse_action = {
|
|
483
|
+
f"__DROP__{rjk.new_name}__DROP__": rjk.new_name
|
|
484
|
+
for rjk in settings.join_input.right_select.join_key_selects if rjk.keep
|
|
485
|
+
}
|
|
486
|
+
|
|
487
|
+
if join_key_duplication_command:
|
|
488
|
+
self._add_code(f"{right_df} = {right_df}.with_columns([{', '.join(join_key_duplication_command)}])")
|
|
489
|
+
|
|
490
|
+
after_join_drop_cols = [
|
|
491
|
+
k.new_name for k in settings.join_input.left_select.join_key_selects
|
|
492
|
+
if not k.keep
|
|
493
|
+
]
|
|
494
|
+
|
|
495
|
+
return left_on, right_on, reverse_action, after_join_drop_cols
|
|
496
|
+
|
|
497
|
+
def _handle_right_join_keys(self, settings: input_schema.NodeJoin, left_df: str,
|
|
498
|
+
left_on: List[str], right_on: List[str]) -> Tuple[
|
|
499
|
+
List[str], List[str], None, List[str]]:
|
|
500
|
+
"""Handle key transformations for right joins.
|
|
501
|
+
|
|
502
|
+
For right joins:
|
|
503
|
+
- Join keys from right DataFrame are preserved
|
|
504
|
+
- Left DataFrame join keys are prefixed with __jk_ to avoid conflicts
|
|
505
|
+
- Polars appends "_right" suffix to conflicting column names
|
|
506
|
+
|
|
507
|
+
Args:
|
|
508
|
+
settings: NodeJoin settings containing join configuration
|
|
509
|
+
left_df: Variable name of the left DataFrame
|
|
510
|
+
left_on: List of left DataFrame column names to join on
|
|
511
|
+
right_on: List of right DataFrame column names to join on
|
|
512
|
+
|
|
513
|
+
Returns:
|
|
514
|
+
Tuple containing:
|
|
515
|
+
- left_on: Modified left join columns with __jk_ prefix where needed
|
|
516
|
+
- right_on: Unchanged right join columns
|
|
517
|
+
- reverse_action: None (no post-join renaming needed)
|
|
518
|
+
- after_join_drop_cols: Right join keys marked for dropping
|
|
519
|
+
"""
|
|
520
|
+
join_key_duplication_command = [
|
|
521
|
+
f'pl.col("{ljk.new_name}").alias("__jk_{ljk.new_name}")'
|
|
522
|
+
for ljk in settings.join_input.left_select.join_key_selects if ljk.keep
|
|
523
|
+
]
|
|
524
|
+
|
|
525
|
+
# Update left_on keys
|
|
526
|
+
for position, left_on_key in enumerate(left_on):
|
|
527
|
+
left_on_select = settings.join_input.left_select.get_select_input_on_new_name(left_on_key)
|
|
528
|
+
if left_on_select and left_on_select.keep:
|
|
529
|
+
left_on[position] = f"__jk_{left_on_select.new_name}"
|
|
530
|
+
|
|
531
|
+
if join_key_duplication_command:
|
|
532
|
+
self._add_code(f"{left_df} = {left_df}.with_columns([{', '.join(join_key_duplication_command)}])")
|
|
533
|
+
|
|
534
|
+
# Calculate columns to drop after join
|
|
535
|
+
left_join_keys_keep = {jk.new_name for jk in settings.join_input.left_select.join_key_selects if jk.keep}
|
|
536
|
+
after_join_drop_cols_right = [
|
|
537
|
+
jk.new_name if jk.new_name not in left_join_keys_keep else jk.new_name + "_right"
|
|
538
|
+
for jk in settings.join_input.right_select.join_key_selects if not jk.keep
|
|
539
|
+
]
|
|
540
|
+
after_join_drop_cols = list(set(after_join_drop_cols_right))
|
|
541
|
+
|
|
542
|
+
return left_on, right_on, None, after_join_drop_cols
|
|
543
|
+
|
|
544
|
+
def _handle_outer_join_keys(self, settings: input_schema.NodeJoin, right_df: str,
|
|
545
|
+
left_on: List[str], right_on: List[str]) -> Tuple[
|
|
546
|
+
List[str], List[str], Dict, List[str]]:
|
|
547
|
+
"""Handle key transformations for outer joins.
|
|
548
|
+
|
|
549
|
+
For outer joins:
|
|
550
|
+
- Both left and right join keys may need to be preserved
|
|
551
|
+
- Right DataFrame join keys are prefixed with __jk_ when they conflict
|
|
552
|
+
- Post-join renaming reverses the __jk_ prefix
|
|
553
|
+
|
|
554
|
+
Args:
|
|
555
|
+
settings: NodeJoin settings containing join configuration
|
|
556
|
+
right_df: Variable name of the right DataFrame
|
|
557
|
+
left_on: List of left DataFrame column names to join on
|
|
558
|
+
right_on: List of right DataFrame column names to join on
|
|
559
|
+
|
|
560
|
+
Returns:
|
|
561
|
+
Tuple containing:
|
|
562
|
+
- left_on: Unchanged left join columns
|
|
563
|
+
- right_on: Modified right join columns with __jk_ prefix where needed
|
|
564
|
+
- reverse_action: Mapping to remove __jk_ prefix after join
|
|
565
|
+
- after_join_drop_cols: Combined list of columns to drop from both sides
|
|
566
|
+
"""
|
|
567
|
+
left_join_keys = {jk.new_name for jk in settings.join_input.left_select.join_key_selects}
|
|
568
|
+
|
|
569
|
+
join_keys_to_keep_and_rename = [
|
|
570
|
+
rjk for rjk in settings.join_input.right_select.join_key_selects
|
|
571
|
+
if rjk.keep and rjk.new_name in left_join_keys
|
|
572
|
+
]
|
|
573
|
+
|
|
574
|
+
join_key_rename_command = {
|
|
575
|
+
rjk.new_name: f"__jk_{rjk.new_name}"
|
|
576
|
+
for rjk in join_keys_to_keep_and_rename
|
|
577
|
+
}
|
|
578
|
+
|
|
579
|
+
# Update right_on keys
|
|
580
|
+
for position, right_on_key in enumerate(right_on):
|
|
581
|
+
right_on_select = settings.join_input.right_select.get_select_input_on_new_name(right_on_key)
|
|
582
|
+
if right_on_select and right_on_select.keep and right_on_select.new_name in left_join_keys:
|
|
583
|
+
right_on[position] = f"__jk_{right_on_select.new_name}"
|
|
584
|
+
|
|
585
|
+
if join_key_rename_command:
|
|
586
|
+
self._add_code(f"{right_df} = {right_df}.rename({join_key_rename_command})")
|
|
587
|
+
|
|
588
|
+
reverse_action = {f"__jk_{rjk.new_name}": rjk.new_name for rjk in join_keys_to_keep_and_rename}
|
|
589
|
+
|
|
590
|
+
# Calculate columns to drop after join
|
|
591
|
+
after_join_drop_cols_left = [
|
|
592
|
+
jk.new_name for jk in settings.join_input.left_select.join_key_selects if not jk.keep
|
|
593
|
+
]
|
|
594
|
+
after_join_drop_cols_right = [
|
|
595
|
+
jk.new_name if jk.new_name not in left_join_keys else jk.new_name + "_right"
|
|
596
|
+
for jk in settings.join_input.right_select.join_key_selects if not jk.keep
|
|
597
|
+
]
|
|
598
|
+
after_join_drop_cols = after_join_drop_cols_left + after_join_drop_cols_right
|
|
599
|
+
|
|
600
|
+
return left_on, right_on, reverse_action, after_join_drop_cols
|
|
601
|
+
|
|
602
|
+
def _execute_join_with_post_processing(self, settings: input_schema.NodeJoin, var_name: str,
|
|
603
|
+
left_df: str, right_df: str, left_on: List[str], right_on: List[str],
|
|
604
|
+
after_join_drop_cols: List[str], reverse_action: Optional[Dict]) -> None:
|
|
605
|
+
"""Execute the join operation and apply post-processing steps.
|
|
606
|
+
|
|
607
|
+
Generates the actual join code with any necessary post-processing:
|
|
608
|
+
1. Executes the join operation
|
|
609
|
+
2. For right joins: Collects to eager mode (Polars requirement)
|
|
610
|
+
3. Drops unnecessary columns
|
|
611
|
+
4. Renames temporary columns back to final names
|
|
612
|
+
5. For right joins: Converts back to lazy mode
|
|
613
|
+
|
|
614
|
+
Args:
|
|
615
|
+
settings: NodeJoin settings containing join configuration
|
|
616
|
+
var_name: Name of the variable to store the result
|
|
617
|
+
left_df: Variable name of the left DataFrame
|
|
618
|
+
right_df: Variable name of the right DataFrame
|
|
619
|
+
left_on: List of left DataFrame column names to join on
|
|
620
|
+
right_on: List of right DataFrame column names to join on
|
|
621
|
+
after_join_drop_cols: List of columns to drop after join
|
|
622
|
+
reverse_action: Dictionary for renaming columns after join (or None)
|
|
623
|
+
|
|
624
|
+
Returns:
|
|
625
|
+
None: Modifies internal state by adding generated code
|
|
626
|
+
"""
|
|
627
|
+
self._add_code(f"{var_name} = ({left_df}.join(")
|
|
628
|
+
self._add_code(f" {right_df},")
|
|
629
|
+
self._add_code(f" left_on={left_on},")
|
|
630
|
+
self._add_code(f" right_on={right_on},")
|
|
631
|
+
self._add_code(f' how="{settings.join_input.how}"')
|
|
632
|
+
self._add_code(" )")
|
|
633
|
+
|
|
634
|
+
# Handle right join special case
|
|
635
|
+
if settings.join_input.how == 'right':
|
|
636
|
+
self._add_code(".collect()") # Right join needs to be collected first cause of issue with rename
|
|
637
|
+
|
|
638
|
+
# Apply post-join transformations
|
|
639
|
+
if after_join_drop_cols:
|
|
640
|
+
self._add_code(f".drop({after_join_drop_cols})")
|
|
641
|
+
|
|
642
|
+
if reverse_action:
|
|
643
|
+
self._add_code(f".rename({reverse_action})")
|
|
644
|
+
|
|
645
|
+
# Convert back to lazy for right joins
|
|
646
|
+
if settings.join_input.how == 'right':
|
|
647
|
+
self._add_code(f".lazy()")
|
|
648
|
+
|
|
649
|
+
self._add_code(")")
|
|
266
650
|
def _handle_group_by(self, settings: input_schema.NodeGroupBy, var_name: str, input_vars: Dict[str, str]) -> None:
|
|
267
651
|
"""Handle group by nodes."""
|
|
268
652
|
input_df = input_vars.get('main', 'df')
|