Flowfile 0.3.5-py3-none-any.whl → 0.3.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of Flowfile might be problematic.

Files changed (121)
  1. flowfile/__init__.py +3 -3
  2. flowfile/web/static/assets/CloudConnectionManager-2dfdce2f.css +86 -0
  3. flowfile/web/static/assets/CloudConnectionManager-d004942f.js +784 -0
  4. flowfile/web/static/assets/CloudStorageReader-29d14fcc.css +143 -0
  5. flowfile/web/static/assets/CloudStorageReader-eccf9fc2.js +437 -0
  6. flowfile/web/static/assets/CloudStorageWriter-49c9a4b2.css +138 -0
  7. flowfile/web/static/assets/CloudStorageWriter-b1ba6bba.js +430 -0
  8. flowfile/web/static/assets/{CrossJoin-dfcf7351.js → CrossJoin-68981877.js} +8 -8
  9. flowfile/web/static/assets/{DatabaseConnectionSettings-b2afb1d7.js → DatabaseConnectionSettings-0b06649c.js} +2 -2
  10. flowfile/web/static/assets/{DatabaseManager-824a49b2.js → DatabaseManager-8349a426.js} +2 -2
  11. flowfile/web/static/assets/{DatabaseReader-a48124d8.js → DatabaseReader-905344f8.js} +9 -9
  12. flowfile/web/static/assets/{DatabaseWriter-b47cbae2.js → DatabaseWriter-9f5b8638.js} +9 -9
  13. flowfile/web/static/assets/{ExploreData-fdfc45a4.js → ExploreData-131a6d53.js} +5 -5
  14. flowfile/web/static/assets/{ExternalSource-861b0e71.js → ExternalSource-e3549dcc.js} +6 -6
  15. flowfile/web/static/assets/{Filter-f87bb897.js → Filter-6e0730ae.js} +8 -8
  16. flowfile/web/static/assets/{Formula-1e2ed720.js → Formula-02f033e6.js} +75 -9
  17. flowfile/web/static/assets/{Formula-b8cefc31.css → Formula-29f19d21.css} +10 -0
  18. flowfile/web/static/assets/{FuzzyMatch-b6cc4fdd.js → FuzzyMatch-54c14036.js} +9 -9
  19. flowfile/web/static/assets/{GraphSolver-6a371f4c.js → GraphSolver-08a3f499.js} +5 -5
  20. flowfile/web/static/assets/{GroupBy-f7b7f472.js → GroupBy-2ae38139.js} +6 -6
  21. flowfile/web/static/assets/{Join-eec38203.js → Join-493b9772.js} +23 -15
  22. flowfile/web/static/assets/{Join-41c0f331.css → Join-f45eff22.css} +20 -20
  23. flowfile/web/static/assets/{ManualInput-9aaa46fb.js → ManualInput-4373d163.js} +106 -34
  24. flowfile/web/static/assets/{ManualInput-ac7b9972.css → ManualInput-a71b52c6.css} +29 -17
  25. flowfile/web/static/assets/{Output-3b2ca045.js → Output-b534f3c7.js} +4 -4
  26. flowfile/web/static/assets/{Pivot-a4f5d88f.js → Pivot-2968ff65.js} +6 -6
  27. flowfile/web/static/assets/{PolarsCode-49ce444f.js → PolarsCode-65136536.js} +6 -6
  28. flowfile/web/static/assets/{Read-07acdc9a.js → Read-c56339ed.js} +6 -6
  29. flowfile/web/static/assets/{RecordCount-6a21da56.js → RecordCount-1c641a5e.js} +5 -5
  30. flowfile/web/static/assets/{RecordId-949bdc17.js → RecordId-df308b8f.js} +6 -6
  31. flowfile/web/static/assets/{Sample-7afca6e1.js → Sample-293e8a64.js} +5 -5
  32. flowfile/web/static/assets/{SecretManager-b41c029d.js → SecretManager-03911655.js} +2 -2
  33. flowfile/web/static/assets/{Select-32b28406.js → Select-3058a13d.js} +8 -8
  34. flowfile/web/static/assets/{SettingsSection-a0f15a05.js → SettingsSection-fbf4fb39.js} +1 -1
  35. flowfile/web/static/assets/{Sort-fc6ba0e2.js → Sort-a29bbaf7.js} +6 -6
  36. flowfile/web/static/assets/{TextToRows-23127596.js → TextToRows-c7d7760e.js} +8 -8
  37. flowfile/web/static/assets/{UnavailableFields-c42880a3.js → UnavailableFields-118f1d20.js} +2 -2
  38. flowfile/web/static/assets/{Union-39eecc6c.js → Union-f0589571.js} +5 -5
  39. flowfile/web/static/assets/{Unique-a0e8fe61.js → Unique-7329a207.js} +8 -8
  40. flowfile/web/static/assets/{Unpivot-1e2d43f0.js → Unpivot-30b0be15.js} +5 -5
  41. flowfile/web/static/assets/{api-44ca9e9c.js → api-602fb95c.js} +1 -1
  42. flowfile/web/static/assets/api-fb67319c.js +80 -0
  43. flowfile/web/static/assets/cloud_storage_reader-aa1415d6.png +0 -0
  44. flowfile/web/static/assets/{designer-267d44f1.js → designer-94a6bf4d.js} +36 -34
  45. flowfile/web/static/assets/{documentation-6c0810a2.js → documentation-a224831e.js} +1 -1
  46. flowfile/web/static/assets/{dropDown-52790b15.js → dropDown-c2d2aa97.js} +1 -1
  47. flowfile/web/static/assets/{fullEditor-e272b506.js → fullEditor-921ac5fd.js} +2 -2
  48. flowfile/web/static/assets/{genericNodeSettings-4bdcf98e.js → genericNodeSettings-7013cc94.js} +3 -3
  49. flowfile/web/static/assets/{index-e235a8bc.js → index-3a75211d.js} +19 -6
  50. flowfile/web/static/assets/{nodeTitle-fc3fc4b7.js → nodeTitle-a63d4680.js} +3 -3
  51. flowfile/web/static/assets/{secretApi-cdc2a3fd.js → secretApi-763aec6e.js} +1 -1
  52. flowfile/web/static/assets/{selectDynamic-96aa82cd.js → selectDynamic-08464729.js} +3 -3
  53. flowfile/web/static/assets/{vue-codemirror.esm-25e75a08.js → vue-codemirror.esm-f15a5f87.js} +2 -1
  54. flowfile/web/static/assets/{vue-content-loader.es-6c4b1c24.js → vue-content-loader.es-93bd09d7.js} +1 -1
  55. flowfile/web/static/index.html +1 -1
  56. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/METADATA +8 -3
  57. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/RECORD +108 -103
  58. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/entry_points.txt +2 -0
  59. flowfile_core/__init__.py +2 -0
  60. flowfile_core/configs/node_store/nodes.py +8 -6
  61. flowfile_core/database/connection.py +63 -15
  62. flowfile_core/database/init_db.py +0 -1
  63. flowfile_core/database/models.py +49 -2
  64. flowfile_core/flowfile/code_generator/code_generator.py +401 -17
  65. flowfile_core/flowfile/connection_manager/models.py +1 -1
  66. flowfile_core/flowfile/database_connection_manager/db_connections.py +216 -2
  67. flowfile_core/flowfile/extensions.py +1 -1
  68. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +259 -0
  69. flowfile_core/flowfile/flow_data_engine/create/funcs.py +19 -8
  70. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +522 -59
  71. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +12 -2
  72. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +1 -1
  73. flowfile_core/flowfile/flow_data_engine/join/__init__.py +2 -1
  74. flowfile_core/flowfile/flow_data_engine/join/utils.py +25 -0
  75. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +29 -22
  76. flowfile_core/flowfile/flow_data_engine/utils.py +1 -40
  77. flowfile_core/flowfile/flow_graph.py +119 -82
  78. flowfile_core/flowfile/flow_node/flow_node.py +68 -33
  79. flowfile_core/flowfile/flow_node/models.py +32 -3
  80. flowfile_core/flowfile/flow_node/schema_callback.py +3 -2
  81. flowfile_core/flowfile/sources/external_sources/__init__.py +0 -2
  82. flowfile_core/flowfile/sources/external_sources/factory.py +4 -7
  83. flowfile_core/flowfile/utils.py +1 -23
  84. flowfile_core/main.py +3 -2
  85. flowfile_core/routes/cloud_connections.py +81 -0
  86. flowfile_core/routes/logs.py +0 -1
  87. flowfile_core/routes/routes.py +3 -39
  88. flowfile_core/schemas/cloud_storage_schemas.py +215 -0
  89. flowfile_core/schemas/input_schema.py +37 -15
  90. flowfile_core/schemas/schemas.py +7 -2
  91. flowfile_core/schemas/transform_schema.py +97 -22
  92. flowfile_core/utils/utils.py +40 -1
  93. flowfile_core/utils/validate_setup.py +41 -0
  94. flowfile_frame/flow_frame.py +253 -102
  95. flowfile_frame/flow_frame_methods.py +13 -13
  96. flowfile_worker/external_sources/s3_source/main.py +216 -0
  97. flowfile_worker/external_sources/s3_source/models.py +142 -0
  98. flowfile_worker/funcs.py +51 -6
  99. flowfile_worker/models.py +22 -2
  100. flowfile_worker/routes.py +40 -38
  101. flowfile_worker/utils.py +1 -1
  102. test_utils/s3/commands.py +46 -0
  103. test_utils/s3/data_generator.py +291 -0
  104. test_utils/s3/fixtures.py +209 -0
  105. flowfile/web/static/assets/AirbyteReader-1ac35765.css +0 -314
  106. flowfile/web/static/assets/AirbyteReader-e08044e5.js +0 -922
  107. flowfile/web/static/assets/dropDownGeneric-60f56a8a.js +0 -72
  108. flowfile/web/static/assets/dropDownGeneric-895680d6.css +0 -10
  109. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +0 -159
  110. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +0 -172
  111. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +0 -173
  112. flowfile_core/schemas/external_sources/airbyte_schemas.py +0 -20
  113. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  114. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +0 -161
  115. flowfile_worker/external_sources/airbyte_sources/main.py +0 -89
  116. flowfile_worker/external_sources/airbyte_sources/models.py +0 -133
  117. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  118. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/LICENSE +0 -0
  119. {flowfile-0.3.5.dist-info → flowfile-0.3.6.dist-info}/WHEEL +0 -0
  120. {flowfile_core/flowfile/sources/external_sources/airbyte_sources → flowfile_worker/external_sources/s3_source}/__init__.py +0 -0
  121. {flowfile_core/schemas/external_sources → test_utils/s3}/__init__.py +0 -0
flowfile_core/database/connection.py
@@ -1,33 +1,69 @@
-
 from sqlalchemy import create_engine
 from contextlib import contextmanager
 from sqlalchemy.orm import sessionmaker
 import os
+import sys
+from pathlib import Path
+from flowfile_core.configs import logger
+
+
+def get_app_data_dir() -> Path:
+    """Get the appropriate application data directory for the current platform."""
+    app_name = "Flowfile"
+
+    if sys.platform == "win32":
+        # Windows: C:\Users\{username}\AppData\Local\flowfile
+        base_dir = os.environ.get("LOCALAPPDATA")
+        if not base_dir:
+            base_dir = os.path.join(os.path.expanduser("~"), "AppData", "Local")
+    elif sys.platform == "darwin":
+        # macOS: ~/Library/Application Support/flowfile
+        base_dir = os.path.join(os.path.expanduser("~"), "Library", "Application Support")
+    else:
+        # Linux: ~/.local/share/flowfile or use XDG_DATA_HOME
+        base_dir = os.environ.get("XDG_DATA_HOME")
+        if not base_dir:
+            base_dir = os.path.join(os.path.expanduser("~"), ".local", "share")
+
+    app_dir = Path(base_dir) / app_name
+
+    print(f"Using application data directory: {app_dir}")
+    app_dir.mkdir(parents=True, exist_ok=True)
+
+    return app_dir
 
 
 def get_database_url():
+    """Get the database URL based on the current environment."""
     if os.environ.get("TESTING") == "True":
-        # Use a file-based test database instead of in-memory
+        # Use a temporary test database
         test_db_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../test_flowfile.db")
         return f"sqlite:///{test_db_path}"
-    # elif os.environ.get("FLOWFILE_MODE") == "electron":

-    return "sqlite:///./flowfile.db"
+    custom_db_path = os.environ.get("FLOWFILE_DB_PATH")
+    if custom_db_path:
+        # logger.error("Using database URL:", os.environ.get("FLOWFILE_DB_URL"))
+        return f"sqlite:///{custom_db_path}"
+    # Use centralized location
+    app_dir = get_app_data_dir()
+
+    db_path = app_dir / "flowfile.db"
+    logger.info(f"Using database URL: sqlite:///{db_path}")
+    return f"sqlite:///{db_path}"
+
+
+def get_database_path() -> Path:
+    """Get the actual path to the database file (useful for backup/info purposes)."""
+    url = get_database_url()
+    if url.startswith("sqlite:///"):
+        return Path(url.replace("sqlite:///", ""))
+    return None

-    # else:
-    #     # Use PostgreSQL for Docker mode
-    #     host = os.environ.get("DB_HOST", "localhost")
-    #     port = os.environ.get("DB_PORT", "5432")
-    #     user = os.environ.get("DB_USER", "postgres")
-    #     password = os.environ.get("DB_PASSWORD", "postgres")
-    #     db = os.environ.get("DB_NAME", "flowfile")
-    #     return f"postgresql://{user}:{password}@{host}:{port}/{db}"
-    #

 # Create database engine
 engine = create_engine(
     get_database_url(),
-    connect_args={"check_same_thread": False} if os.environ.get("FLOWFILE_MODE") == "electron" else {}
+    connect_args={"check_same_thread": False} if "sqlite" in get_database_url() else {}
 )

 # Create session factory
@@ -35,6 +71,7 @@ SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)


 def get_db():
+    """Dependency for FastAPI to get database session."""
     db = SessionLocal()
     try:
         yield db
@@ -44,8 +81,19 @@ def get_db():

 @contextmanager
 def get_db_context():
+    """Context manager for getting database session."""
     db = SessionLocal()
     try:
         yield db
     finally:
-        db.close()
+        db.close()
+
+
+def get_database_info():
+    """Get information about the current database configuration."""
+    return {
+        "url": get_database_url(),
+        "path": str(get_database_path()) if get_database_path() else None,
+        "app_data_dir": str(get_app_data_dir()),
+        "platform": sys.platform
+    }
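
Taken together, the connection-module changes move the SQLite file out of the process working directory: get_database_url() now resolves, in order, the test fixture path (TESTING), an explicit FLOWFILE_DB_PATH override, and finally a per-platform application data directory created by get_app_data_dir(). A minimal sketch of exercising the override (the path below is hypothetical):

    import os

    # Must be set before flowfile_core.database.connection is first imported,
    # because the module-level engine is created from get_database_url() at import time.
    os.environ["FLOWFILE_DB_PATH"] = "/tmp/flowfile-demo.db"  # hypothetical location

    from flowfile_core.database.connection import get_database_url, get_database_info

    print(get_database_url())   # sqlite:////tmp/flowfile-demo.db
    print(get_database_info())  # {"url": ..., "path": ..., "app_data_dir": ..., "platform": ...}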

flowfile_core/database/init_db.py
@@ -13,7 +13,6 @@ db_models.Base.metadata.create_all(bind=engine)

 def create_default_local_user(db: Session):
     local_user = db.query(db_models.User).filter(db_models.User.username == "local_user").first()
-
     if not local_user:
         random_password = ''.join(secrets.choice(string.ascii_letters + string.digits) for _ in range(32))
         hashed_password = pwd_context.hash(random_password)

flowfile_core/database/models.py
@@ -1,6 +1,6 @@
-
-from sqlalchemy import Boolean, Column, ForeignKey, Integer, String, Text
+from sqlalchemy import Boolean, Column, ForeignKey, Integer, String, Text, DateTime
 from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.sql import func

 Base = declarative_base()

@@ -39,3 +39,50 @@ class DatabaseConnection(Base):
     ssl_enabled = Column(Boolean, default=False)
     password_id = Column(Integer, ForeignKey("secrets.id"))
     user_id = Column(Integer, ForeignKey("users.id"))
+
+
+class CloudStorageConnection(Base):
+    __tablename__ = "cloud_storage_connections"
+
+    id = Column(Integer, primary_key=True, index=True)
+    connection_name = Column(String, index=True, nullable=False)
+    storage_type = Column(String, nullable=False)  # 's3', 'adls', 'gcs'
+    auth_method = Column(String, nullable=False)  # 'access_key', 'iam_role', etc.
+
+    # AWS S3 fields
+    aws_region = Column(String, nullable=True)
+    aws_access_key_id = Column(String, nullable=True)
+    aws_secret_access_key_id = Column(Integer, ForeignKey("secrets.id"), nullable=True)
+    aws_session_token_id = Column(Integer, ForeignKey("secrets.id"), nullable=True)
+    aws_role_arn = Column(String, nullable=True)
+    aws_allow_unsafe_html = Column(Boolean, nullable=True)
+
+    # Azure ADLS fields
+    azure_account_name = Column(String, nullable=True)
+    azure_account_key_id = Column(Integer, ForeignKey("secrets.id"), nullable=True)
+    azure_tenant_id = Column(String, nullable=True)
+    azure_client_id = Column(String, nullable=True)
+    azure_client_secret_id = Column(Integer, ForeignKey("secrets.id"), nullable=True)
+    azure_sas_token_id = Column(Integer, ForeignKey("secrets.id"), nullable=True)
+
+    # Common fields
+    endpoint_url = Column(String, nullable=True)
+    extra_config = Column(Text, nullable=True)  # JSON field for additional config
+    verify_ssl = Column(Boolean, default=True)
+
+    # Metadata
+    user_id = Column(Integer, ForeignKey("users.id"), nullable=False)
+    created_at = Column(DateTime, default=func.now(), nullable=False)
+    updated_at = Column(DateTime, default=func.now(), onupdate=func.now(), nullable=False)
+
+
+class CloudStoragePermission(Base):
+    __tablename__ = "cloud_storage_permissions"
+
+    id = Column(Integer, primary_key=True, index=True)
+    connection_id = Column(Integer, ForeignKey("cloud_storage_connections.id"), nullable=False)
+    resource_path = Column(String, nullable=False)  # e.g., "s3://bucket-name"
+    can_read = Column(Boolean, default=True)
+    can_write = Column(Boolean, default=False)
+    can_delete = Column(Boolean, default=False)
+    can_list = Column(Boolean, default=True)
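
These two new tables back the cloud storage connection feature surfaced by the CloudConnectionManager, CloudStorageReader, and CloudStorageWriter assets listed above, with secret material stored by reference to the existing secrets table. A minimal sketch of persisting a connection through the session helpers from connection.py (the connection name, region, and user id are hypothetical, and the secret foreign keys are left unset for brevity):

    from flowfile_core.database.connection import get_db_context
    from flowfile_core.database.models import CloudStorageConnection, CloudStoragePermission

    with get_db_context() as db:
        conn = CloudStorageConnection(
            connection_name="demo-s3",   # hypothetical
            storage_type="s3",
            auth_method="access_key",
            aws_region="eu-west-1",      # hypothetical
            user_id=1,                   # assumes the default local user has id 1
        )
        db.add(conn)
        db.commit()
        db.refresh(conn)

        # Grant read/list-only access to one bucket (path is illustrative).
        db.add(CloudStoragePermission(connection_id=conn.id, resource_path="s3://example-bucket"))
        db.commit()

In the application these rows are presumably managed through the new flowfile_core/routes/cloud_connections.py endpoints and flowfile_core/flowfile/database_connection_manager/db_connections.py rather than written directly.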

flowfile_core/flowfile/code_generator/code_generator.py
@@ -1,5 +1,4 @@
-from typing import List, Dict, Optional, Set, Tuple, Any
-from collections import defaultdict
+from typing import List, Dict, Optional, Set, Tuple
 import polars as pl

 from flowfile_core.flowfile.flow_graph import FlowGraph
@@ -176,13 +175,10 @@ class FlowGraphToPolarsConverter:

     def _handle_manual_input(self, settings: input_schema.NodeManualInput, var_name: str, input_vars: Dict[str, str]) -> None:
         """Handle manual data input nodes."""
-        if settings.raw_data_format:
-            data = settings.raw_data_format.data
-            flowfile_schema = list(FlowfileColumn.create_from_minimal_field_info(c) for c in settings.raw_data_format.columns)
-            schema = self.get_manual_schema_input(flowfile_schema)
-            self._add_code(f"{var_name} = pl.LazyFrame({data}, schema={schema}, strict=False)")
-        else:
-            self._add_code(f"{var_name} = pl.LazyFrame({settings.raw_data})")
+        data = settings.raw_data_format.data
+        flowfile_schema = list(FlowfileColumn.create_from_minimal_field_info(c) for c in settings.raw_data_format.columns)
+        schema = self.get_manual_schema_input(flowfile_schema)
+        self._add_code(f"{var_name} = pl.LazyFrame({data}, schema={schema}, strict=False)")
         self._add_code("")

     def _handle_filter(self, settings: input_schema.NodeFilter, var_name: str, input_vars: Dict[str, str]) -> None:
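
With the raw_data fallback removed, _handle_manual_input assumes settings.raw_data_format is always populated and always emits a pl.LazyFrame constructor with an explicit schema and strict=False. Purely as an illustration (the variable name, rows, and schema below are made up, not taken from the package), the emitted line has roughly this shape:

    import polars as pl

    df_1 = pl.LazyFrame(
        [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}],   # hypothetical manual-input rows
        schema={"id": pl.Int64, "name": pl.String},          # hypothetical schema
        strict=False,  # let Polars cast values that do not exactly match the declared dtypes
    )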
@@ -247,22 +243,410 @@ class FlowGraphToPolarsConverter:
         self._add_code("")

     def _handle_join(self, settings: input_schema.NodeJoin, var_name: str, input_vars: Dict[str, str]) -> None:
-        """Handle join nodes."""
+        """Handle join nodes by routing to appropriate join type handler.
+
+        This is the main entry point for processing join operations. It determines
+        the type of join and delegates to the appropriate handler method.
+
+        Args:
+            settings: NodeJoin settings containing join configuration
+            var_name: Name of the variable to store the joined DataFrame
+            input_vars: Dictionary mapping input names to DataFrame variable names
+
+        Returns:
+            None: Modifies internal state by adding generated code
+        """
         left_df = input_vars.get('main', input_vars.get('main_0', 'df_left'))
         right_df = input_vars.get('right', input_vars.get('main_1', 'df_right'))

-        # Extract join keys
+        # Ensure left and right DataFrames are distinct
+        if left_df == right_df:
+            right_df = "df_right"
+            self._add_code(f"{right_df} = {left_df}")
+
+        if settings.join_input.how in ("semi", "anti"):
+            self._handle_semi_anti_join(settings, var_name, left_df, right_df)
+        else:
+            self._handle_standard_join(settings, var_name, left_df, right_df)
+
+    def _handle_semi_anti_join(self, settings: input_schema.NodeJoin, var_name: str, left_df: str,
+                               right_df: str) -> None:
+        """Handle semi and anti joins which only return rows from the left DataFrame.
+
+        Semi joins return rows from left DataFrame that have matches in right.
+        Anti joins return rows from left DataFrame that have no matches in right.
+        These joins are simpler as they don't require column management from right DataFrame.
+
+        Args:
+            settings: NodeJoin settings containing join configuration
+            var_name: Name of the variable to store the result
+            left_df: Variable name of the left DataFrame
+            right_df: Variable name of the right DataFrame
+
+        Returns:
+            None: Modifies internal state by adding generated code
+        """
         left_on = [jm.left_col for jm in settings.join_input.join_mapping]
         right_on = [jm.right_col for jm in settings.join_input.join_mapping]

-        self._add_code(f"{var_name} = {left_df}.join(")
-        self._add_code(f" {right_df},")
-        self._add_code(f" left_on={left_on},")
-        self._add_code(f" right_on={right_on},")
-        self._add_code(f' how="{settings.join_input.how}"')
+        self._add_code(f"{var_name} = ({left_df}.join(")
+        self._add_code(f" {right_df},")
+        self._add_code(f" left_on={left_on},")
+        self._add_code(f" right_on={right_on},")
+        self._add_code(f' how="{settings.join_input.how}"')
+        self._add_code(" )")
         self._add_code(")")
-        self._add_code("")

+    def _handle_standard_join(self, settings: input_schema.NodeJoin, var_name: str, left_df: str,
+                              right_df: str) -> None:
+        """Handle standard joins (left, right, inner, outer) with full column management.
+
+        Standard joins may include columns from both DataFrames and require careful
+        management of column names, duplicates, and transformations. This method
+        orchestrates the complete join process including pre/post transformations.
+
+        Process:
+        1. Auto-rename columns to avoid conflicts
+        2. Extract join keys
+        3. Apply pre-join transformations (renames, drops)
+        4. Handle join-specific key transformations
+        5. Execute join with post-processing
+
+        Args:
+            settings: NodeJoin settings containing join configuration
+            var_name: Name of the variable to store the result
+            left_df: Variable name of the left DataFrame
+            right_df: Variable name of the right DataFrame
+
+        Returns:
+            None: Modifies internal state by adding generated code
+        """
+        settings.join_input.auto_rename()
+
+        # Get join keys
+        left_on, right_on = self._get_join_keys(settings)
+
+        # Apply pre-join transformations
+        left_df, right_df = self._apply_pre_join_transformations(settings, left_df, right_df)
+
+        # Handle join-specific key transformations
+        left_on, right_on, reverse_action, after_join_drop_cols = self._handle_join_key_transformations(
+            settings, left_df, right_df, left_on, right_on
+        )
+
+        # Execute the join
+        self._execute_join_with_post_processing(
+            settings, var_name, left_df, right_df, left_on, right_on,
+            after_join_drop_cols, reverse_action
+        )
+
+    def _get_join_keys(self, settings: input_schema.NodeJoin) -> Tuple[List[str], List[str]]:
+        """Extract join keys based on join type.
+
+        Different join types require different handling of join keys:
+        - For outer/right joins: Uses renamed column names for right DataFrame
+        - For other joins: Uses original column names from join mapping
+
+        Args:
+            settings: NodeJoin settings containing join configuration
+
+        Returns:
+            Tuple[List[str], List[str]]: Lists of (left_on, right_on) column names
+        """
+        left_on = [jm.left_col for jm in settings.join_input.get_names_for_table_rename()]
+
+        if settings.join_input.how in ("outer", "right"):
+            right_on = [jm.right_col for jm in settings.join_input.get_names_for_table_rename()]
+        else:
+            right_on = [jm.right_col for jm in settings.join_input.join_mapping]
+
+        return left_on, right_on
+
+    def _apply_pre_join_transformations(self, settings: input_schema.NodeJoin, left_df: str, right_df: str) -> Tuple[
+            str, str]:
+        """Apply column renames and drops before the join operation.
+
+        Pre-join transformations prepare DataFrames by:
+        - Renaming columns according to user specifications
+        - Dropping columns marked as not to keep (except join keys)
+        - Special handling for right/outer joins where join keys may need preservation
+
+        Args:
+            settings: NodeJoin settings containing column rename/drop specifications
+            left_df: Variable name of the left DataFrame
+            right_df: Variable name of the right DataFrame
+
+        Returns:
+            Tuple[str, str]: The same DataFrame variable names (left_df, right_df)
+            Note: DataFrames are modified via generated code, not new variables
+        """
+        # Calculate renames and drops
+        right_renames = {
+            column.old_name: column.new_name
+            for column in settings.join_input.right_select.renames
+            if
+            column.old_name != column.new_name and not column.join_key or settings.join_input.how in ("outer", "right")
+        }
+
+        left_renames = {
+            column.old_name: column.new_name
+            for column in settings.join_input.left_select.renames
+            if column.old_name != column.new_name
+        }
+
+        left_drop_columns = [
+            column.old_name for column in settings.join_input.left_select.renames
+            if not column.keep and not column.join_key
+        ]
+
+        right_drop_columns = [
+            column.old_name for column in settings.join_input.right_select.renames
+            if not column.keep and not column.join_key
+        ]
+
+        # Apply transformations
+        if right_renames:
+            self._add_code(f"{right_df} = {right_df}.rename({right_renames})")
+        if left_renames:
+            self._add_code(f"{left_df} = {left_df}.rename({left_renames})")
+        if left_drop_columns:
+            self._add_code(f"{left_df} = {left_df}.drop({left_drop_columns})")
+        if right_drop_columns:
+            self._add_code(f"{right_df} = {right_df}.drop({right_drop_columns})")
+
+        return left_df, right_df
+
+    def _handle_join_key_transformations(self, settings: input_schema.NodeJoin, left_df: str, right_df: str,
+                                         left_on: List[str], right_on: List[str]) \
+            -> Tuple[List[str], List[str], Optional[Dict], List[str]]:
+        """Route to appropriate join-specific key transformation handler.
+
+        Different join types require different strategies for handling join keys
+        to avoid conflicts and preserve necessary columns.
+
+        Args:
+            settings: NodeJoin settings containing join configuration
+            left_df: Variable name of the left DataFrame
+            right_df: Variable name of the right DataFrame
+            left_on: List of left DataFrame column names to join on
+            right_on: List of right DataFrame column names to join on
+
+        Returns:
+            Tuple containing:
+            - left_on: Potentially modified list of left join columns
+            - right_on: Potentially modified list of right join columns
+            - reverse_action: Dictionary for renaming columns after join (or None)
+            - after_join_drop_cols: List of columns to drop after join
+        """
+        join_type = settings.join_input.how
+
+        if join_type in ("left", "inner"):
+            return self._handle_left_inner_join_keys(settings, right_df, left_on, right_on)
+        elif join_type == "right":
+            return self._handle_right_join_keys(settings, left_df, left_on, right_on)
+        elif join_type == "outer":
+            return self._handle_outer_join_keys(settings, right_df, left_on, right_on)
+        else:
+            return left_on, right_on, None, []
+
+    def _handle_left_inner_join_keys(self, settings: input_schema.NodeJoin, right_df: str,
+                                     left_on: List[str], right_on: List[str]) -> Tuple[
+            List[str], List[str], Dict, List[str]]:
+        """Handle key transformations for left and inner joins.
+
+        For left/inner joins:
+        - Join keys from left DataFrame are preserved
+        - Right DataFrame join keys are temporarily renamed with __DROP__ prefix
+        - After join, these temporary columns can be renamed back if needed
+
+        Args:
+            settings: NodeJoin settings containing join configuration
+            right_df: Variable name of the right DataFrame
+            left_on: List of left DataFrame column names to join on
+            right_on: List of right DataFrame column names to join on
+
+        Returns:
+            Tuple containing:
+            - left_on: Unchanged left join columns
+            - right_on: Unchanged right join columns
+            - reverse_action: Mapping to rename __DROP__ columns after join
+            - after_join_drop_cols: Left join keys marked for dropping
+        """
+        left_join_keys_to_keep = [jk.new_name for jk in settings.join_input.left_select.join_key_selects if jk.keep]
+
+        join_key_duplication_command = [
+            f'pl.col("{rjk.old_name}").alias("__DROP__{rjk.new_name}__DROP__")'
+            for rjk in settings.join_input.right_select.join_key_selects if rjk.keep
+        ]
+
+        reverse_action = {
+            f"__DROP__{rjk.new_name}__DROP__": rjk.new_name
+            for rjk in settings.join_input.right_select.join_key_selects if rjk.keep
+        }
+
+        if join_key_duplication_command:
+            self._add_code(f"{right_df} = {right_df}.with_columns([{', '.join(join_key_duplication_command)}])")
+
+        after_join_drop_cols = [
+            k.new_name for k in settings.join_input.left_select.join_key_selects
+            if not k.keep
+        ]
+
+        return left_on, right_on, reverse_action, after_join_drop_cols
+
+    def _handle_right_join_keys(self, settings: input_schema.NodeJoin, left_df: str,
+                                left_on: List[str], right_on: List[str]) -> Tuple[
+            List[str], List[str], None, List[str]]:
+        """Handle key transformations for right joins.
+
+        For right joins:
+        - Join keys from right DataFrame are preserved
+        - Left DataFrame join keys are prefixed with __jk_ to avoid conflicts
+        - Polars appends "_right" suffix to conflicting column names
+
+        Args:
+            settings: NodeJoin settings containing join configuration
+            left_df: Variable name of the left DataFrame
+            left_on: List of left DataFrame column names to join on
+            right_on: List of right DataFrame column names to join on
+
+        Returns:
+            Tuple containing:
+            - left_on: Modified left join columns with __jk_ prefix where needed
+            - right_on: Unchanged right join columns
+            - reverse_action: None (no post-join renaming needed)
+            - after_join_drop_cols: Right join keys marked for dropping
+        """
+        join_key_duplication_command = [
+            f'pl.col("{ljk.new_name}").alias("__jk_{ljk.new_name}")'
+            for ljk in settings.join_input.left_select.join_key_selects if ljk.keep
+        ]
+
+        # Update left_on keys
+        for position, left_on_key in enumerate(left_on):
+            left_on_select = settings.join_input.left_select.get_select_input_on_new_name(left_on_key)
+            if left_on_select and left_on_select.keep:
+                left_on[position] = f"__jk_{left_on_select.new_name}"
+
+        if join_key_duplication_command:
+            self._add_code(f"{left_df} = {left_df}.with_columns([{', '.join(join_key_duplication_command)}])")
+
+        # Calculate columns to drop after join
+        left_join_keys_keep = {jk.new_name for jk in settings.join_input.left_select.join_key_selects if jk.keep}
+        after_join_drop_cols_right = [
+            jk.new_name if jk.new_name not in left_join_keys_keep else jk.new_name + "_right"
+            for jk in settings.join_input.right_select.join_key_selects if not jk.keep
+        ]
+        after_join_drop_cols = list(set(after_join_drop_cols_right))
+
+        return left_on, right_on, None, after_join_drop_cols
+
+    def _handle_outer_join_keys(self, settings: input_schema.NodeJoin, right_df: str,
+                                left_on: List[str], right_on: List[str]) -> Tuple[
+            List[str], List[str], Dict, List[str]]:
+        """Handle key transformations for outer joins.
+
+        For outer joins:
+        - Both left and right join keys may need to be preserved
+        - Right DataFrame join keys are prefixed with __jk_ when they conflict
+        - Post-join renaming reverses the __jk_ prefix
+
+        Args:
+            settings: NodeJoin settings containing join configuration
+            right_df: Variable name of the right DataFrame
+            left_on: List of left DataFrame column names to join on
+            right_on: List of right DataFrame column names to join on
+
+        Returns:
+            Tuple containing:
+            - left_on: Unchanged left join columns
+            - right_on: Modified right join columns with __jk_ prefix where needed
+            - reverse_action: Mapping to remove __jk_ prefix after join
+            - after_join_drop_cols: Combined list of columns to drop from both sides
+        """
+        left_join_keys = {jk.new_name for jk in settings.join_input.left_select.join_key_selects}
+
+        join_keys_to_keep_and_rename = [
+            rjk for rjk in settings.join_input.right_select.join_key_selects
+            if rjk.keep and rjk.new_name in left_join_keys
+        ]
+
+        join_key_rename_command = {
+            rjk.new_name: f"__jk_{rjk.new_name}"
+            for rjk in join_keys_to_keep_and_rename
+        }
+
+        # Update right_on keys
+        for position, right_on_key in enumerate(right_on):
+            right_on_select = settings.join_input.right_select.get_select_input_on_new_name(right_on_key)
+            if right_on_select and right_on_select.keep and right_on_select.new_name in left_join_keys:
+                right_on[position] = f"__jk_{right_on_select.new_name}"
+
+        if join_key_rename_command:
+            self._add_code(f"{right_df} = {right_df}.rename({join_key_rename_command})")
+
+        reverse_action = {f"__jk_{rjk.new_name}": rjk.new_name for rjk in join_keys_to_keep_and_rename}
+
+        # Calculate columns to drop after join
+        after_join_drop_cols_left = [
+            jk.new_name for jk in settings.join_input.left_select.join_key_selects if not jk.keep
+        ]
+        after_join_drop_cols_right = [
+            jk.new_name if jk.new_name not in left_join_keys else jk.new_name + "_right"
+            for jk in settings.join_input.right_select.join_key_selects if not jk.keep
+        ]
+        after_join_drop_cols = after_join_drop_cols_left + after_join_drop_cols_right
+
+        return left_on, right_on, reverse_action, after_join_drop_cols
+
+    def _execute_join_with_post_processing(self, settings: input_schema.NodeJoin, var_name: str,
+                                           left_df: str, right_df: str, left_on: List[str], right_on: List[str],
+                                           after_join_drop_cols: List[str], reverse_action: Optional[Dict]) -> None:
+        """Execute the join operation and apply post-processing steps.
+
+        Generates the actual join code with any necessary post-processing:
+        1. Executes the join operation
+        2. For right joins: Collects to eager mode (Polars requirement)
+        3. Drops unnecessary columns
+        4. Renames temporary columns back to final names
+        5. For right joins: Converts back to lazy mode
+
+        Args:
+            settings: NodeJoin settings containing join configuration
+            var_name: Name of the variable to store the result
+            left_df: Variable name of the left DataFrame
+            right_df: Variable name of the right DataFrame
+            left_on: List of left DataFrame column names to join on
+            right_on: List of right DataFrame column names to join on
+            after_join_drop_cols: List of columns to drop after join
+            reverse_action: Dictionary for renaming columns after join (or None)
+
+        Returns:
+            None: Modifies internal state by adding generated code
+        """
+        self._add_code(f"{var_name} = ({left_df}.join(")
+        self._add_code(f" {right_df},")
+        self._add_code(f" left_on={left_on},")
+        self._add_code(f" right_on={right_on},")
+        self._add_code(f' how="{settings.join_input.how}"')
+        self._add_code(" )")
+
+        # Handle right join special case
+        if settings.join_input.how == 'right':
+            self._add_code(".collect()")  # Right join needs to be collected first cause of issue with rename
+
+        # Apply post-join transformations
+        if after_join_drop_cols:
+            self._add_code(f".drop({after_join_drop_cols})")
+
+        if reverse_action:
+            self._add_code(f".rename({reverse_action})")
+
+        # Convert back to lazy for right joins
+        if settings.join_input.how == 'right':
+            self._add_code(f".lazy()")
+
+        self._add_code(")")
     def _handle_group_by(self, settings: input_schema.NodeGroupBy, var_name: str, input_vars: Dict[str, str]) -> None:
         """Handle group by nodes."""
         input_df = input_vars.get('main', 'df')
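
The join refactor is easiest to see through the code it emits. For a hypothetical left join of df_left(id, name) with df_right(id, city), where the right-hand key is kept under the auto-renamed output column id_right, the converter now generates code along these lines (all names here are illustrative, not taken from the package):

    import polars as pl

    df_left = pl.LazyFrame({"id": [1, 2], "name": ["a", "b"]})
    df_right = pl.LazyFrame({"id": [1, 3], "city": ["x", "y"]})

    # The right join key is duplicated under a temporary __DROP__ alias so it survives the join...
    df_right = df_right.with_columns([pl.col("id").alias("__DROP__id_right__DROP__")])

    df_join = (df_left.join(
        df_right,
        left_on=["id"],
        right_on=["id"],
        how="left"
    )
    # ...and renamed back to its user-facing name afterwards.
    .rename({"__DROP__id_right__DROP__": "id_right"})
    )

Semi and anti joins skip this bookkeeping entirely, and right joins additionally collect to eager mode before the drop/rename steps and switch back with .lazy() at the end, as noted in _execute_join_with_post_processing.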

flowfile_core/flowfile/connection_manager/models.py
@@ -7,4 +7,4 @@ class Connection:
     group: str  # e.g. source-faker
     name: str  # e.g. source-faker-100000
     config_setting: Any
-    type: str = None  # e.g. airbyte
+    type: str = None