langflow-base-nightly 0.5.0.dev34__py3-none-any.whl → 0.5.0.dev36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langflow/alembic/versions/1cb603706752_modify_uniqueness_constraint_on_file_.py +279 -0
- langflow/components/agents/mcp_component.py +21 -4
- langflow/components/data/kb_ingest.py +15 -16
- langflow/components/docling/__init__.py +198 -0
- langflow/components/docling/docling_inline.py +102 -60
- langflow/components/processing/save_file.py +31 -4
- langflow/initial_setup/starter_projects/Knowledge Ingestion.json +2 -2
- langflow/initial_setup/starter_projects/News Aggregator.json +19 -2
- langflow/initial_setup/starter_projects/Nvidia Remix.json +19 -2
- langflow/services/database/models/file/model.py +4 -2
- {langflow_base_nightly-0.5.0.dev34.dist-info → langflow_base_nightly-0.5.0.dev36.dist-info}/METADATA +1 -1
- {langflow_base_nightly-0.5.0.dev34.dist-info → langflow_base_nightly-0.5.0.dev36.dist-info}/RECORD +14 -13
- {langflow_base_nightly-0.5.0.dev34.dist-info → langflow_base_nightly-0.5.0.dev36.dist-info}/WHEEL +0 -0
- {langflow_base_nightly-0.5.0.dev34.dist-info → langflow_base_nightly-0.5.0.dev36.dist-info}/entry_points.txt +0 -0
langflow/alembic/versions/1cb603706752_modify_uniqueness_constraint_on_file_.py
@@ -0,0 +1,279 @@
+"""Modify uniqueness constraint on file names
+
+Revision ID: 1cb603706752
+Revises: 3162e83e485f
+Create Date: 2025-07-24 07:02:14.896583
+
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+import time
+from typing import Sequence, Union, Iterable, Optional, Set, Tuple
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy import inspect
+
+# revision identifiers, used by Alembic.
+revision: str = "1cb603706752"
+down_revision: Union[str, None] = "3162e83e485f"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+logger = logging.getLogger(__name__)
+
+# Behavior constants
+DUPLICATE_SUFFIX_START = 2  # first suffix to use, e.g., "name_2.ext"
+BATCH_SIZE = 1000  # Process duplicates in batches for large datasets
+
+
+def _get_unique_constraints_by_columns(
+    inspector, table: str, expected_cols: Iterable[str]
+) -> Optional[str]:
+    """Return the name of a unique constraint that matches the exact set of expected columns."""
+    expected = set(expected_cols)
+    for c in inspector.get_unique_constraints(table):
+        cols = set(c.get("column_names") or [])
+        if cols == expected:
+            return c.get("name")
+    return None
+
+
+def _split_base_ext(name: str) -> Tuple[str, str]:
+    """Split a filename into (base, ext) where ext does not include the leading dot; ext may be ''."""
+    if "." in name:
+        base, ext = name.rsplit(".", 1)
+        return base, ext
+    return name, ""
+
+
+def _escape_like(s: str) -> str:
+    # escape backslash first, then SQL LIKE wildcards
+    return s.replace("\\", "\\\\").replace("%", r"\%").replace("_", r"\_")
+
+
+def _like_for_suffixes(base: str, ext: str) -> str:
+    eb = _escape_like(base)
+    if ext:
+        ex = ext.replace("%", r"\%").replace("_", r"\_")
+        return f"{eb}\\_%." + ex  # literal underscore
+    else:
+        return f"{eb}\\_%"
+
+
+def _next_available_name(conn, user_id: str, base_name: str) -> str:
+    """
+    Compute the next available non-conflicting name for a given user.
+    Handles names with or without extensions and existing _N suffixes.
+    """
+    base, ext = _split_base_ext(base_name)
+
+    # Load all sibling names once
+    rows = conn.execute(
+        sa.text("""
+            SELECT name
+            FROM file
+            WHERE user_id = :uid
+              AND (name = :base_name OR name LIKE :like ESCAPE '\\')
+        """),
+        {"uid": user_id, "base_name": base_name, "like": _like_for_suffixes(base, ext)},
+    ).scalars().all()
+
+    taken: Set[str] = set(rows)
+
+    # Pattern to detect base_N(.ext) and capture N
+    if ext:
+        rx = re.compile(rf"^{re.escape(base)}_(\d+)\.{re.escape(ext)}$")
+    else:
+        rx = re.compile(rf"^{re.escape(base)}_(\d+)$")
+
+    max_n = 1
+    for n in rows:
+        m = rx.match(n)
+        if m:
+            max_n = max(max_n, int(m.group(1)))
+
+    n = max(max_n + 1, DUPLICATE_SUFFIX_START)
+    while True:
+        candidate = f"{base}_{n}.{ext}" if ext else f"{base}_{n}"
+        if candidate not in taken:
+            return candidate
+        n += 1
+
+
+def _handle_duplicates_before_upgrade(conn) -> None:
+    """
+    Ensure (user_id, name) is unique by renaming older duplicates before adding the composite unique constraint.
+    Keeps the most recently updated/created/id-highest record; renames the rest with _N suffix.
+    """
+    logger.info("Scanning for duplicate file names per user...")
+    duplicates = conn.execute(
+        sa.text(
+            """
+            SELECT user_id, name, COUNT(*) AS cnt
+            FROM file
+            GROUP BY user_id, name
+            HAVING COUNT(*) > 1
+            """
+        )
+    ).fetchall()
+
+    if not duplicates:
+        logger.info("No duplicates found.")
+        return
+
+    logger.info("Found %d duplicate sets. Resolving...", len(duplicates))
+
+    # Add progress indicator for large datasets
+    if len(duplicates) > 100:
+        logger.info("Large number of duplicates detected. This may take several minutes...")
+
+    # Wrap in a nested transaction so we fail cleanly on any error
+    with conn.begin_nested():
+        # Process duplicates in batches for better performance on large datasets
+        for batch_start in range(0, len(duplicates), BATCH_SIZE):
+            batch_end = min(batch_start + BATCH_SIZE, len(duplicates))
+            batch = duplicates[batch_start:batch_end]
+
+            if len(duplicates) > BATCH_SIZE:
+                logger.info("Processing batch %d-%d of %d duplicate sets...",
+                            batch_start + 1, batch_end, len(duplicates))
+
+            for user_id, name, cnt in batch:
+                logger.debug("Resolving duplicates for user=%s, name=%r (count=%s)", user_id, name, cnt)
+
+                file_ids = conn.execute(
+                    sa.text(
+                        """
+                        SELECT id
+                        FROM file
+                        WHERE user_id = :uid AND name = :name
+                        ORDER BY updated_at DESC, created_at DESC, id DESC
+                        """
+                    ),
+                    {"uid": user_id, "name": name},
+                ).scalars().all()
+
+                # Keep the first (most recent), rename the rest
+                for file_id in file_ids[1:]:
+                    new_name = _next_available_name(conn, user_id, name)
+                    conn.execute(
+                        sa.text("UPDATE file SET name = :new_name WHERE id = :fid"),
+                        {"new_name": new_name, "fid": file_id},
+                    )
+                    logger.debug("Renamed id=%s: %r -> %r", file_id, name, new_name)
+
+            # Progress update for large batches
+            if len(duplicates) > BATCH_SIZE and batch_end < len(duplicates):
+                logger.info("Completed %d of %d duplicate sets (%.1f%%)",
+                            batch_end, len(duplicates), (batch_end / len(duplicates)) * 100)
+
+    logger.info("Duplicate resolution completed.")
+
+
+def upgrade() -> None:
+    start_time = time.time()
+    logger.info("Starting upgrade: adding composite unique (name, user_id) on file")
+
+    conn = op.get_bind()
+    inspector = inspect(conn)
+
+    # 1) Resolve pre-existing duplicates so the new unique can be created
+    duplicate_start = time.time()
+    _handle_duplicates_before_upgrade(conn)
+    duplicate_duration = time.time() - duplicate_start
+
+    if duplicate_duration > 1.0:  # Only log if it took more than 1 second
+        logger.info("Duplicate resolution completed in %.2f seconds", duplicate_duration)
+
+    # 2) Detect existing single-column unique on name (if any)
+    inspector = inspect(conn)  # refresh inspector
+    single_name_uc = _get_unique_constraints_by_columns(inspector, "file", {"name"})
+    composite_uc = _get_unique_constraints_by_columns(inspector, "file", {"name", "user_id"})
+
+    # 3) Use a unified, reflection-based batch_alter_table for both Postgres and SQLite.
+    #    recreate="always" ensures a safe table rebuild on SQLite and a standard alter on Postgres.
+    constraint_start = time.time()
+    with op.batch_alter_table("file", recreate="always") as batch_op:
+        # Drop old single-column unique if present
+        if single_name_uc:
+            logger.info("Dropping existing single-column unique: %s", single_name_uc)
+            batch_op.drop_constraint(single_name_uc, type_="unique")
+
+        # Create composite unique if not already present
+        if not composite_uc:
+            logger.info("Creating composite unique: file_name_user_id_key on (name, user_id)")
+            batch_op.create_unique_constraint("file_name_user_id_key", ["name", "user_id"])
+        else:
+            logger.info("Composite unique already present: %s", composite_uc)
+
+    constraint_duration = time.time() - constraint_start
+    if constraint_duration > 1.0:  # Only log if it took more than 1 second
+        logger.info("Constraint operations completed in %.2f seconds", constraint_duration)
+
+    total_duration = time.time() - start_time
+    logger.info("Upgrade completed successfully in %.2f seconds", total_duration)
+
+
+def downgrade() -> None:
+    start_time = time.time()
+    logger.info("Starting downgrade: reverting to single-column unique on (name)")
+
+    conn = op.get_bind()
+    inspector = inspect(conn)
+
+    # 1) Ensure no cross-user duplicates on name (since we'll enforce global uniqueness on name)
+    logger.info("Checking for cross-user duplicate names prior to downgrade...")
+    validation_start = time.time()
+
+    dup_names = conn.execute(
+        sa.text(
+            """
+            SELECT name, COUNT(*) AS cnt
+            FROM file
+            GROUP BY name
+            HAVING COUNT(*) > 1
+            """
+        )
+    ).fetchall()
+
+    validation_duration = time.time() - validation_start
+    if validation_duration > 1.0:  # Only log if it took more than 1 second
+        logger.info("Validation completed in %.2f seconds", validation_duration)
+
+    if dup_names:
+        examples = [row[0] for row in dup_names[:10]]
+        raise RuntimeError(
+            "Downgrade aborted: duplicate names exist across users. "
+            f"Examples: {examples}{'...' if len(dup_names) > 10 else ''}. "
+            "Rename conflicting files before downgrading."
+        )
+
+    # 2) Detect constraints
+    inspector = inspect(conn)  # refresh
+    composite_uc = _get_unique_constraints_by_columns(inspector, "file", {"name", "user_id"})
+    single_name_uc = _get_unique_constraints_by_columns(inspector, "file", {"name"})
+
+    # 3) Perform alteration using batch with reflect to preserve other objects
+    constraint_start = time.time()
+    with op.batch_alter_table("file", recreate="always") as batch_op:
+        if composite_uc:
+            logger.info("Dropping composite unique: %s", composite_uc)
+            batch_op.drop_constraint(composite_uc, type_="unique")
+        else:
+            logger.info("No composite unique found to drop.")
+
+        if not single_name_uc:
+            logger.info("Creating single-column unique: file_name_key on (name)")
+            batch_op.create_unique_constraint("file_name_key", ["name"])
+        else:
+            logger.info("Single-column unique already present: %s", single_name_uc)
+
+    constraint_duration = time.time() - constraint_start
+    if constraint_duration > 1.0:  # Only log if it took more than 1 second
+        logger.info("Constraint operations completed in %.2f seconds", constraint_duration)
+
+    total_duration = time.time() - start_time
+    logger.info("Downgrade completed successfully in %.2f seconds", total_duration)
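The core of this migration is the _N-suffix renaming applied to per-user duplicates before the composite unique constraint on (name, user_id) is created: the most recently updated record keeps its name, older copies get the smallest free suffix starting at 2. A minimal standalone sketch of that naming scheme follows; the next_available_name helper and the example names are illustrative only, not part of the package.

import re

def next_available_name(existing: set[str], base_name: str, start: int = 2) -> str:
    """Return base_name with the smallest free _N suffix, mirroring the migration's scheme."""
    base, ext = base_name.rsplit(".", 1) if "." in base_name else (base_name, "")
    suffix = rf"\.{re.escape(ext)}$" if ext else "$"
    rx = re.compile(rf"^{re.escape(base)}_(\d+)" + suffix)
    # Highest existing _N suffix among sibling names (default 1 when none exist)
    max_n = max((int(m.group(1)) for m in map(rx.match, existing) if m), default=1)
    n = max(max_n + 1, start)
    while True:
        candidate = f"{base}_{n}.{ext}" if ext else f"{base}_{n}"
        if candidate not in existing:
            return candidate
        n += 1

# A second "report.pdf" for the same user becomes "report_2.pdf", the next one "report_3.pdf".
print(next_available_name({"report.pdf"}, "report.pdf"))                   # report_2.pdf
print(next_available_name({"report.pdf", "report_2.pdf"}, "report.pdf"))   # report_3.pdf

The migration itself additionally scopes the candidate search to a single user via the LIKE query shown above, so the same name may now exist for different users.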
langflow/components/agents/mcp_component.py
@@ -16,14 +16,14 @@ from langflow.base.mcp.util import (
 )
 from langflow.custom.custom_component.component_with_cache import ComponentWithCache
 from langflow.inputs.inputs import InputTypes  # noqa: TC001
-from langflow.io import DropdownInput, McpInput, MessageTextInput, Output
+from langflow.io import DropdownInput, McpInput, MessageTextInput, Output, SecretStrInput
 from langflow.io.schema import flatten_schema, schema_to_langflow_inputs
 from langflow.logging import logger
 from langflow.schema.dataframe import DataFrame
 from langflow.schema.message import Message
-from langflow.services.auth.utils import create_user_longterm_token

 # Import get_server from the backend API
+from langflow.services.auth.utils import create_user_longterm_token, get_current_user
 from langflow.services.database.models.user.crud import get_user_by_id
 from langflow.services.deps import get_session, get_settings_service, get_storage_service

@@ -96,6 +96,13 @@ class MCPToolsComponent(ComponentWithCache):
             show=False,
             tool_mode=False,
         ),
+        SecretStrInput(
+            name="api_key",
+            display_name="Langflow API Key",
+            info="Langflow API key for authentication when fetching MCP servers and tools.",
+            required=False,
+            advanced=True,
+        ),
     ]

     outputs = [
@@ -155,8 +162,18 @@ class MCPToolsComponent(ComponentWithCache):

         try:
             async for db in get_session():
-                user_id, _ = await create_user_longterm_token(db)
-                current_user = await get_user_by_id(db, user_id)
+                # TODO: In 1.6, this may need to be removed or adjusted
+                # Try to get the super user token, if possible
+                if self.api_key:
+                    current_user = await get_current_user(
+                        token=None,
+                        query_param=self.api_key,
+                        header_param=None,
+                        db=db,
+                    )
+                else:
+                    user_id, _ = await create_user_longterm_token(db)
+                    current_user = await get_user_by_id(db, user_id)

                 # Try to get server config from DB/API
                 server_config = await get_server(
langflow/components/data/kb_ingest.py
@@ -139,8 +139,8 @@ class KBIngestionComponent(Component):
             {
                 "column_name": "text",
                 "vectorize": True,
-                "identifier":
-            }
+                "identifier": True,
+            },
         ],
     ),
     IntInput(
@@ -187,9 +187,8 @@ class KBIngestionComponent(Component):
         df_columns = set(df_source.columns)
         for config in config_list:
            col_name = config.get("column_name")
-            if col_name not in df_columns
+            if col_name not in df_columns:
                 msg = f"Column '{col_name}' not found in DataFrame. Available columns: {sorted(df_columns)}"
-                self.log(f"Warning: {msg}")
                 raise ValueError(msg)

         return config_list
@@ -295,9 +294,7 @@ class KBIngestionComponent(Component):
             if not cfg_path.exists():
                 cfg_path.write_text(json.dumps(config_list, indent=2))

-        except
-            if not self.silent_errors:
-                raise
+        except (OSError, TypeError, ValueError) as e:
             self.log(f"Error saving KB files: {e}")

     def _build_column_metadata(self, config_list: list[dict[str, Any]], df_source: pd.DataFrame) -> dict[str, Any]:
@@ -367,9 +364,7 @@ class KBIngestionComponent(Component):
             chroma.add_documents(documents)
             self.log(f"Added {len(documents)} documents to vector store '{self.knowledge_base}'")

-        except
-            if not self.silent_errors:
-                raise
+        except (OSError, ValueError, RuntimeError) as e:
             self.log(f"Error creating vector store: {e}")

     def _convert_df_to_data_objects(self, df_source: pd.DataFrame, config_list: list[dict[str, Any]]) -> list[Data]:
@@ -407,16 +402,22 @@ class KBIngestionComponent(Component):

         # Convert each row to a Data object
         for _, row in df_source.iterrows():
-            # Build content text from
-
+            # Build content text from identifier columns using list comprehension
+            identifier_parts = [str(row[col]) for col in content_cols if col in row and pd.notna(row[col])]

-
+            # Join all parts into a single string
+            page_content = " ".join(identifier_parts)

             # Build metadata from NON-vectorized columns only (simple key-value pairs)
             data_dict = {
                 "text": page_content,  # Main content for vectorization
             }

+            # Add identifier columns if they exist
+            if identifier_cols:
+                identifier_parts = [str(row[col]) for col in identifier_cols if col in row and pd.notna(row[col])]
+                page_content = " ".join(identifier_parts)
+
             # Add metadata columns as simple key-value pairs
             for col in df_source.columns:
                 if col not in content_cols and col in row and pd.notna(row[col]):
@@ -526,9 +527,7 @@ class KBIngestionComponent(Component):

             return Data(data=meta)

-        except
-            if not self.silent_errors:
-                raise
+        except (OSError, ValueError, RuntimeError, KeyError) as e:
             self.log(f"Error in KB ingestion: {e}")
             self.status = f"❌ KB ingestion failed: {e}"
             return Data(data={"error": str(e), "kb_name": self.knowledge_base})
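For reference, the corrected default above implies a column configuration of the following shape, and the fixed validation simply checks each configured column against the incoming DataFrame. A small standalone sketch; the sample DataFrame is hypothetical and not taken from the component.

import pandas as pd

# Per-column configuration in the shape the component's default now uses
config_list = [
    {"column_name": "text", "vectorize": True, "identifier": True},
]

df_source = pd.DataFrame({"text": ["Revenue grew 12%"], "year": [2024]})

# Mirrors the corrected check: every configured column must exist in the DataFrame,
# otherwise a ValueError listing the available columns is raised.
df_columns = set(df_source.columns)
for config in config_list:
    col_name = config.get("column_name")
    if col_name not in df_columns:
        msg = f"Column '{col_name}' not found in DataFrame. Available columns: {sorted(df_columns)}"
        raise ValueError(msg)
print("Column configuration is valid")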
langflow/components/docling/__init__.py
@@ -1,7 +1,13 @@
 from __future__ import annotations

+import signal
+import sys
+import traceback
+from contextlib import suppress
 from typing import TYPE_CHECKING, Any

+from loguru import logger
+
 from langflow.components._importing import import_mod

 if TYPE_CHECKING:
@@ -41,3 +47,195 @@ def __getattr__(attr_name: str) -> Any:

 def __dir__() -> list[str]:
     return list(__all__)
+
+
+def docling_worker(file_paths: list[str], queue, pipeline: str, ocr_engine: str):
+    """Worker function for processing files with Docling in a separate process."""
+    # Signal handling for graceful shutdown
+    shutdown_requested = False
+
+    def signal_handler(signum: int, frame) -> None:  # noqa: ARG001
+        """Handle shutdown signals gracefully."""
+        nonlocal shutdown_requested
+        signal_names: dict[int, str] = {signal.SIGTERM: "SIGTERM", signal.SIGINT: "SIGINT"}
+        signal_name = signal_names.get(signum, f"signal {signum}")
+
+        logger.debug(f"Docling worker received {signal_name}, initiating graceful shutdown...")
+        shutdown_requested = True
+
+        # Send shutdown notification to parent process
+        with suppress(Exception):
+            queue.put({"error": f"Worker interrupted by {signal_name}", "shutdown": True})
+
+        # Exit gracefully
+        sys.exit(0)
+
+    def check_shutdown() -> None:
+        """Check if shutdown was requested and exit if so."""
+        if shutdown_requested:
+            logger.info("Shutdown requested, exiting worker...")
+
+            with suppress(Exception):
+                queue.put({"error": "Worker shutdown requested", "shutdown": True})
+
+            sys.exit(0)
+
+    # Register signal handlers early
+    try:
+        signal.signal(signal.SIGTERM, signal_handler)
+        signal.signal(signal.SIGINT, signal_handler)
+        logger.debug("Signal handlers registered for graceful shutdown")
+    except (OSError, ValueError) as e:
+        # Some signals might not be available on all platforms
+        logger.warning(f"Warning: Could not register signal handlers: {e}")
+
+    # Check for shutdown before heavy imports
+    check_shutdown()
+
+    try:
+        from docling.datamodel.base_models import ConversionStatus, InputFormat
+        from docling.datamodel.pipeline_options import (
+            OcrOptions,
+            PdfPipelineOptions,
+            VlmPipelineOptions,
+        )
+        from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+        from docling.models.factories import get_ocr_factory
+        from docling.pipeline.vlm_pipeline import VlmPipeline
+
+        # Check for shutdown after imports
+        check_shutdown()
+        logger.debug("Docling dependencies loaded successfully")
+
+    except ModuleNotFoundError:
+        msg = (
+            "Docling is an optional dependency of Langflow. "
+            "Install with `uv pip install 'langflow[docling]'` "
+            "or refer to the documentation"
+        )
+        queue.put({"error": msg})
+        return
+    except ImportError as e:
+        # A different import failed (e.g., a transitive dependency); preserve details.
+        queue.put({"error": f"Failed to import a Docling dependency: {e}"})
+        return
+    except KeyboardInterrupt:
+        logger.warning("KeyboardInterrupt during imports, exiting...")
+        queue.put({"error": "Worker interrupted during imports", "shutdown": True})
+        return
+
+    # Configure the standard PDF pipeline
+    def _get_standard_opts() -> PdfPipelineOptions:
+        check_shutdown()  # Check before heavy operations
+
+        pipeline_options = PdfPipelineOptions()
+        pipeline_options.do_ocr = ocr_engine != ""
+        if pipeline_options.do_ocr:
+            ocr_factory = get_ocr_factory(
+                allow_external_plugins=False,
+            )
+
+            ocr_options: OcrOptions = ocr_factory.create_options(
+                kind=ocr_engine,
+            )
+            pipeline_options.ocr_options = ocr_options
+        return pipeline_options
+
+    # Configure the VLM pipeline
+    def _get_vlm_opts() -> VlmPipelineOptions:
+        check_shutdown()  # Check before heavy operations
+        return VlmPipelineOptions()
+
+    # Configure the main format options and create the DocumentConverter()
+    def _get_converter() -> DocumentConverter:
+        check_shutdown()  # Check before heavy operations
+
+        if pipeline == "standard":
+            pdf_format_option = PdfFormatOption(
+                pipeline_options=_get_standard_opts(),
+            )
+        elif pipeline == "vlm":
+            pdf_format_option = PdfFormatOption(pipeline_cls=VlmPipeline, pipeline_options=_get_vlm_opts())
+        else:
+            msg = f"Unknown pipeline: {pipeline!r}"
+            raise ValueError(msg)
+
+        format_options: dict[InputFormat, FormatOption] = {
+            InputFormat.PDF: pdf_format_option,
+            InputFormat.IMAGE: pdf_format_option,
+        }
+
+        return DocumentConverter(format_options=format_options)
+
+    try:
+        # Check for shutdown before creating converter (can be slow)
+        check_shutdown()
+        logger.info(f"Initializing {pipeline} pipeline with OCR: {ocr_engine or 'disabled'}")
+
+        converter = _get_converter()
+
+        # Check for shutdown before processing files
+        check_shutdown()
+        logger.info(f"Starting to process {len(file_paths)} files...")
+
+        # Process files with periodic shutdown checks
+        results = []
+        for i, file_path in enumerate(file_paths):
+            # Check for shutdown before processing each file
+            check_shutdown()
+
+            logger.debug(f"Processing file {i + 1}/{len(file_paths)}: {file_path}")
+
+            try:
+                # Process single file (we can't easily interrupt convert_all)
+                single_result = converter.convert_all([file_path])
+                results.extend(single_result)
+
+                # Check for shutdown after each file
+                check_shutdown()
+
+            except (OSError, ValueError, RuntimeError, ImportError) as file_error:
+                # Handle specific file processing errors
+                logger.error(f"Error processing file {file_path}: {file_error}")
+                # Continue with other files, but check for shutdown
+                check_shutdown()
+            except Exception as file_error:  # noqa: BLE001
+                # Catch any other unexpected errors to prevent worker crash
+                logger.error(f"Unexpected error processing file {file_path}: {file_error}")
+                # Continue with other files, but check for shutdown
+                check_shutdown()
+
+        # Final shutdown check before sending results
+        check_shutdown()
+
+        # Process the results while maintaining the original structure
+        processed_data = [
+            {"document": res.document, "file_path": str(res.input.file), "status": res.status.name}
+            if res.status == ConversionStatus.SUCCESS
+            else None
+            for res in results
+        ]
+
+        logger.info(f"Successfully processed {len([d for d in processed_data if d])} files")
+        queue.put(processed_data)
+
+    except KeyboardInterrupt:
+        logger.warning("KeyboardInterrupt during processing, exiting gracefully...")
+        queue.put({"error": "Worker interrupted during processing", "shutdown": True})
+        return
+    except Exception as e:  # noqa: BLE001
+        if shutdown_requested:
+            logger.exception("Exception occurred during shutdown, exiting...")
+            return
+
+        # Send any processing error to the main process with traceback
+        error_info = {"error": str(e), "traceback": traceback.format_exc()}
+        logger.error(f"Error in worker: {error_info}")
+        queue.put(error_info)
+    finally:
+        logger.info("Docling worker finishing...")
+        # Ensure we don't leave any hanging processes
+        if shutdown_requested:
+            logger.debug("Worker shutdown completed")
+        else:
+            logger.debug("Worker completed normally")
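docling_worker is written to run in a child process and to report either per-file results or an error dict back through a queue, shutting down cleanly on SIGTERM/SIGINT. A hedged sketch of how a parent process might drive it; the real call site lives in docling_inline.py and may differ, the input path is hypothetical, and the Docling extra must be installed (`uv pip install 'langflow[docling]'`).

import multiprocessing as mp

from langflow.components.docling import docling_worker

if __name__ == "__main__":
    queue = mp.Queue()
    proc = mp.Process(
        target=docling_worker,
        kwargs={
            "file_paths": ["example.pdf"],  # hypothetical input file
            "queue": queue,
            "pipeline": "standard",         # or "vlm"
            "ocr_engine": "",               # empty string disables OCR
        },
    )
    proc.start()
    result = queue.get()   # list of {"document", "file_path", "status"} entries, or {"error": ...}
    proc.join(timeout=30)
    if proc.is_alive():
        proc.terminate()   # the worker traps SIGTERM and exits gracefully
    print(result)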