nv-ingest-api 2025.10.8.dev20251008-py3-none-any.whl → 2025.10.9.dev20251009-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nv-ingest-api has been flagged as possibly problematic.

nv_ingest_api/internal/extract/pdf/engines/pdfium.py

@@ -332,6 +332,7 @@ def _extract_page_elements(

     # Process each extracted element based on extraction flags
     for page_idx, page_element in page_element_results:
+        page_reading_index = page_idx + 1
         # Skip elements that shouldn't be extracted based on flags
         if (not extract_tables) and (page_element.type_string == "table"):
             continue
@@ -347,7 +348,7 @@ def _extract_page_elements(
         # Construct metadata for the page element
         page_element_meta = construct_page_element_metadata(
             page_element,
-            page_idx,
+            page_reading_index,
             page_count,
             source_metadata,
             base_unified_metadata,
@@ -473,6 +474,7 @@ def pdfium_extractor(
     for page_idx in range(page_count):
         page = doc.get_page(page_idx)
         page_width, page_height = page.get_size()
+        page_reading_index = page_idx + 1

         # Text extraction
         if extract_text:
@@ -481,7 +483,7 @@ def pdfium_extractor(
             text_meta = construct_text_metadata(
                 [page_text],
                 pdf_metadata.keywords,
-                page_idx,
+                page_reading_index,
                 -1,
                 -1,
                 -1,
@@ -499,7 +501,7 @@ def pdfium_extractor(
             image_data = _extract_page_images(
                 extract_images_method,
                 page,
-                page_idx,
+                page_reading_index,
                 page_width,
                 page_height,
                 page_count,
@@ -518,7 +520,7 @@ def pdfium_extractor(
                 base64_image, _ = scale_image_to_encoding_size(base64_image, max_base64_size=2**24 - 1)
                 image_meta = construct_image_metadata_from_base64(
                     base64_image,
-                    page_idx,
+                    page_reading_index,
                     page_count,
                     source_metadata,
                     base_unified_metadata,
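Net effect of the pdfium.py hunks: page numbers recorded in element, text, and image metadata shift from pdfium's 0-based page_idx to a 1-based reading index. A minimal sketch of the convention (page_metadata here is an illustrative stand-in, not the package's constructor):

# Sketch: pdfium iterates pages 0-based; metadata now records a 1-based
# "reading index" so page numbers match what a human sees in a viewer.
def page_metadata(page_reading_index: int, page_count: int) -> dict:
    return {"page_number": page_reading_index, "page_count": page_count}

for page_idx in range(3):                 # engine order: 0, 1, 2
    page_reading_index = page_idx + 1     # reading order: 1, 2, 3
    print(page_metadata(page_reading_index, 3))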
nv_ingest_api/internal/primitives/nim/nim_client.py

@@ -326,16 +326,52 @@ class NimClient:

         outputs = [grpcclient.InferRequestedOutput(output_name) for output_name in output_names]

-        response = self.client.infer(
-            model_name=model_name, parameters=parameters, inputs=input_tensors, outputs=outputs
-        )
+        base_delay = 0.5
+        attempt = 0
+        retries_429 = 0
+        max_grpc_retries = self.max_429_retries

-        logger.debug(f"gRPC inference response: {response}")
+        while attempt < self.max_retries:
+            try:
+                response = self.client.infer(
+                    model_name=model_name, parameters=parameters, inputs=input_tensors, outputs=outputs
+                )

-        if len(outputs) == 1:
-            return response.as_numpy(outputs[0].name())
-        else:
-            return [response.as_numpy(output.name()) for output in outputs]
+                logger.debug(f"gRPC inference response: {response}")
+
+                if len(outputs) == 1:
+                    return response.as_numpy(outputs[0].name())
+                else:
+                    return [response.as_numpy(output.name()) for output in outputs]
+
+            except grpcclient.InferenceServerException as e:
+                status = e.status()
+                if status == "StatusCode.UNAVAILABLE" and "Exceeds maximum queue size".lower() in e.message().lower():
+                    retries_429 += 1
+                    logger.warning(
+                        f"Received gRPC {status} for model '{model_name}'. "
+                        f"Attempt {retries_429} of {max_grpc_retries}."
+                    )
+                    if retries_429 >= max_grpc_retries:
+                        logger.error(f"Max retries for gRPC {status} exceeded for model '{model_name}'.")
+                        raise
+
+                    backoff_time = base_delay * (2**retries_429)
+                    time.sleep(backoff_time)
+                    continue
+
+                else:
+                    # For other server-side errors (e.g., INVALID_ARGUMENT, NOT_FOUND),
+                    # retrying will not help. We should fail fast.
+                    logger.error(
+                        f"Received non-retryable gRPC error from Triton for model '{model_name}': {e.message()}"
+                    )
+                    raise
+
+            except Exception as e:
+                # Catch any other unexpected exceptions (e.g., network issues not caught by Triton client)
+                logger.error(f"An unexpected error occurred during gRPC inference for model '{model_name}': {e}")
+                raise

     def _http_infer(self, formatted_input: dict) -> dict:
         """
nv_ingest_api-2025.10.9.dev20251009.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest-api
-Version: 2025.10.8.dev20251008
+Version: 2025.10.9.dev20251009
 Summary: Python module with core document ingestion functions.
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
nv_ingest_api-2025.10.9.dev20251009.dist-info/RECORD

@@ -32,7 +32,7 @@ nv_ingest_api/internal/extract/pdf/engines/__init__.py,sha256=u4GnAZmDKRl0RwYGIR
 nv_ingest_api/internal/extract/pdf/engines/adobe.py,sha256=VT0dEqkU-y2uGkaCqxtKYov_Q8R1028UQVBchgMLca4,17466
 nv_ingest_api/internal/extract/pdf/engines/llama.py,sha256=MwzM-n2tu0FHM0wDe_0mONLlzHrPte7EOTuPtzCh7Zs,8384
 nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py,sha256=IVbNcH_phMiRSxnkZ04pGfQrPJ-x1zVR3hXyhxv7juc,22977
-nv_ingest_api/internal/extract/pdf/engines/pdfium.py,sha256=CCfxcHAS3mED8zD6GKTGNUi02CzBMs7FsSopevhsiyk,22720
+nv_ingest_api/internal/extract/pdf/engines/pdfium.py,sha256=yAndWwh_k00nP0spYGxlewP3RBPxE4QR-b3U3VgXnBo,22852
 nv_ingest_api/internal/extract/pdf/engines/tika.py,sha256=6GyR2l6EsgNZl9jnYDXLeKNK9Fj2Mw9y2UWDq-eSkOc,3169
 nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py,sha256=jrv2B4VZAH4PevAQrFz965qz8UyXq3rViiOTbGLejec,14908
 nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py,sha256=uTPTUTWQsGM1oeTUo49_hzwC5Yy9iEokrnS3z3WvtIo,5988
@@ -50,7 +50,7 @@ nv_ingest_api/internal/primitives/control_message_task.py,sha256=nWVB3QsP6p8BKwH
 nv_ingest_api/internal/primitives/ingest_control_message.py,sha256=8rA0UbPDSB3avReAKNxiUa_FCy7fIQpqk6tfmcYUibA,9879
 nv_ingest_api/internal/primitives/nim/__init__.py,sha256=-dFBTHQnMKV0yc5tfSqIT-rkJXKtpcmyUfTPs8TJAi8,339
 nv_ingest_api/internal/primitives/nim/default_values.py,sha256=W92XjfyeC6uuVxut6J7p00x1kpNsnXIDb97gSVytZJk,380
-nv_ingest_api/internal/primitives/nim/nim_client.py,sha256=kQAHWwZ6kjTVYZSfa0qRyIOFcqrhMe8LUygGtgzAly0,26321
+nv_ingest_api/internal/primitives/nim/nim_client.py,sha256=AwOyED1Kt6F-pxJPi4kpb15ioeWHV5z5zTtJ9GliyYQ,28007
 nv_ingest_api/internal/primitives/nim/nim_model_interface.py,sha256=gWhyR33mIgEOYirq53WOk1bRl1SL0C_SVrM4w1-JmKU,4166
 nv_ingest_api/internal/primitives/nim/model_interface/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest_api/internal/primitives/nim/model_interface/cached.py,sha256=b1HX-PY1ExW5V6pXC1ZiHdobeG_BmbPr3rBbVJef13s,11003
@@ -164,10 +164,10 @@ nv_ingest_api/util/string_processing/configuration.py,sha256=2HS08msccuPCT0fn_jf
 nv_ingest_api/util/string_processing/yaml.py,sha256=6SW2O6wbXRhGbhETMbtXjYCZn53HeCNOP6a96AaxlHs,1454
 nv_ingest_api/util/system/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nv_ingest_api/util/system/hardware_info.py,sha256=1UFM8XE6M3pgQcpbVsCsqDQ7Dj-zzptL-XRE-DEu9UA,27213
-nv_ingest_api-2025.10.8.dev20251008.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest_api-2025.10.9.dev20251009.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
 udfs/__init__.py,sha256=pXFqPgXIUqHDfj7SAR1Q19tt8KwGv_iMvhHyziz4AYM,205
-udfs/llm_summarizer_udf.py,sha256=sIMfcH4GRyciTKUtq4dmhd6fZmAp07X32irIC4k7nEI,7316
-nv_ingest_api-2025.10.8.dev20251008.dist-info/METADATA,sha256=lHvP6DR5gEfSPzyevDfnSrkPZl-5TB9S35V3GzJY7L4,14085
-nv_ingest_api-2025.10.8.dev20251008.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-nv_ingest_api-2025.10.8.dev20251008.dist-info/top_level.txt,sha256=I1lseG9FF0CH93SPx4kFblsxFuv190cfzaas_CLNIiw,19
-nv_ingest_api-2025.10.8.dev20251008.dist-info/RECORD,,
+udfs/llm_summarizer_udf.py,sha256=t_ZFoz0e03uECYcRw4IabRj0GBlwAoJkJn13NL2wbsI,7217
+nv_ingest_api-2025.10.9.dev20251009.dist-info/METADATA,sha256=KB8EkNNQMTlk9Q7aDa09O4Q6DBQBCbBxJl0vtRoVbJY,14085
+nv_ingest_api-2025.10.9.dev20251009.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nv_ingest_api-2025.10.9.dev20251009.dist-info/top_level.txt,sha256=I1lseG9FF0CH93SPx4kFblsxFuv190cfzaas_CLNIiw,19
+nv_ingest_api-2025.10.9.dev20251009.dist-info/RECORD,,
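RECORD entries pair each installed file with an unpadded URL-safe base64 SHA-256 digest (per the wheel spec, PEP 427/PEP 376), so the changed hashes above can be re-verified against an unpacked wheel. A small sketch:

import base64
import hashlib

def record_hash(path: str) -> str:
    # RECORD stores sha256=<unpadded URL-safe base64 of the SHA-256 digest>
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# For the new wheel's nim_client.py this should return
# "sha256=AwOyED1Kt6F-pxJPi4kpb15ioeWHV5z5zTtJ9GliyYQ", matching the RECORD entry above.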
udfs/llm_summarizer_udf.py

@@ -2,22 +2,40 @@
 """
 LLM Content Summarizer UDF for NV-Ingest Pipeline

-This UDF uses an LLM API to generate concise summaries
-of text content chunks, adding AI-generated summaries to the metadata for
-enhanced downstream processing and search capabilities.
+This UDF uses an LLM to generate concise summaries of text content chunks. These summaries are added to the metadata
+for enhanced downstream processing and search capabilities.

-Environment Variables:
+These variables can be set in the environment before running the pipeline. These can be treated as kwargs.
 - NVIDIA_API_KEY: API key for NVIDIA NIM endpoints (required)
 - LLM_SUMMARIZATION_MODEL: Model to use (default: nvidia/llama-3.1-nemotron-70b-instruct)
-- LLM_SUMMARIZATION_BASE_URL: API base URL (default: https://integrate.api.nvidia.com/v1)
-- LLM_SUMMARIZATION_TIMEOUT: API timeout in seconds (default: 60)
-- LLM_MIN_CONTENT_LENGTH: Minimum content length to summarize (default: 50)
-- LLM_MAX_CONTENT_LENGTH: Maximum content length to send to API (default: 12000)
+- LLM_BASE_URL: base URL (default: https://integrate.api.nvidia.com/v1)
+- TIMEOUT: API timeout in seconds (default: 60)
+- MIN_CONTENT_LENGTH: Minimum content length to summarize (default: 50)
+- MAX_CONTENT_LENGTH: Maximum content length to send to API (default: 12000)
+TODO: Implement this
+- NUM_CHUNKS: (Optional) Number of first and last pages to summarize. default=1
 """

-import os
 import logging
-from typing import Optional
+import os
+import time
+
+# REMOVE BEFORE MERGING
+# import yaml
+# from pathlib import Path
+
+
+logger = logging.getLogger(__name__)
+
+PROMPT = """
+Here are the contents from the first and last page of a document. Focus on the main purpose, key topics,
+and important details. Just return the summary as a paragraph. Do not add special characters for formatting.
+This summary will be used for document search and understanding.
+
+[CONTENT]
+{content}
+[END CONTENT]
+"""


 def content_summarizer(control_message: "IngestControlMessage") -> "IngestControlMessage":  # noqa: F821
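Note that the rewritten docstring lists shortened variable names (LLM_BASE_URL, TIMEOUT, MIN_CONTENT_LENGTH, MAX_CONTENT_LENGTH), while the function bodies in the hunks below still read the LLM_SUMMARIZATION_*/LLM_* names. A hedged configuration sketch using the names the code actually reads (values illustrative):

import os

# Values are illustrative; NVIDIA_API_KEY is the only required setting.
os.environ["NVIDIA_API_KEY"] = "nvapi-..."
os.environ["LLM_SUMMARIZATION_MODEL"] = "nvidia/llama-3.1-nemotron-70b-instruct"
os.environ["LLM_SUMMARIZATION_BASE_URL"] = "https://integrate.api.nvidia.com/v1"
os.environ["LLM_SUMMARIZATION_TIMEOUT"] = "60"
os.environ["LLM_MIN_CONTENT_LENGTH"] = "50"
os.environ["LLM_MAX_CONTENT_LENGTH"] = "12000"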
@@ -27,13 +45,6 @@ def content_summarizer(control_message: "IngestControlMessage") -> "IngestContro
     This function processes text primitives and generates concise summaries using
     an LLM API, storing the results in the metadata's custom_content field.

-    Features:
-    - Flexible content detection across multiple metadata locations
-    - Robust error handling with graceful fallbacks
-    - Comprehensive logging for monitoring and debugging
-    - Configurable content length thresholds
-    - Safe metadata manipulation preserving existing data
-
     Parameters
     ----------
     control_message : IngestControlMessage
@@ -44,167 +55,150 @@ def content_summarizer(control_message: "IngestControlMessage") -> "IngestContro
44
55
  IngestControlMessage
45
56
  The modified control message with LLM summaries added to metadata
46
57
  """
47
- from openai import OpenAI
48
-
49
- logger = logging.getLogger(__name__)
50
58
  logger.info("UDF: Starting LLM content summarization")
51
59
 
52
- # Get configuration from environment
53
- api_key = os.getenv("NVIDIA_API_KEY", "")
60
+ api_key = os.getenv("NVIDIA_API_KEY")
54
61
  model_name = os.getenv("LLM_SUMMARIZATION_MODEL", "nvidia/llama-3.1-nemotron-70b-instruct")
55
62
  base_url = os.getenv("LLM_SUMMARIZATION_BASE_URL", "https://integrate.api.nvidia.com/v1")
56
- timeout = int(os.getenv("LLM_SUMMARIZATION_TIMEOUT", "60"))
57
- min_content_length = int(os.getenv("LLM_MIN_CONTENT_LENGTH", "50"))
58
- max_content_length = int(os.getenv("LLM_MAX_CONTENT_LENGTH", "12000"))
63
+ min_content_length = int(os.getenv("LLM_MIN_CONTENT_LENGTH", 50))
64
+ max_content_length = int(os.getenv("LLM_MAX_CONTENT_LENGTH", 12000))
65
+ timeout = int(os.getenv("LLM_SUMMARIZATION_TIMEOUT", 60))
66
+
67
+ stats = {
68
+ "skipped": False,
69
+ "failed": False,
70
+ "tokens": 0,
71
+ "duration": 0.0,
72
+ }
59
73
 
60
74
  if not api_key:
61
- logger.warning("NVIDIA_API_KEY not found, skipping summarization")
75
+ logger.error("NVIDIA_API_KEY not set. Skipping...")
62
76
  return control_message
63
77
 
64
- # Get the DataFrame payload
65
78
  df = control_message.payload()
66
- if df is None or len(df) == 0:
67
- logger.warning("No payload found in control message")
68
- return control_message
69
-
70
- logger.info(f"Processing {len(df)} rows for LLM summarization")
71
79
 
72
- # Initialize OpenAI client with error handling
73
- try:
74
- client = OpenAI(base_url=base_url, api_key=api_key, timeout=timeout)
75
- except Exception as e:
76
- logger.error(f"Failed to initialize OpenAI client: {e}")
80
+ if df is None or df.empty:
81
+ logger.warning("No payload found. Nothing to summarize.")
77
82
  return control_message
78
83
 
79
- # Stats for reporting
80
- stats = {"processed": 0, "summarized": 0, "skipped": 0, "failed": 0}
81
-
82
- # Process each row
83
- for idx, row in df.iterrows():
84
- stats["processed"] += 1
85
-
86
- try:
87
- # Extract content - be more flexible about where it comes from
88
- content = _extract_content(row, logger)
89
-
90
- if not content:
91
- stats["skipped"] += 1
92
- continue
93
-
94
- content = content.strip()
95
- if len(content) < min_content_length:
96
- stats["skipped"] += 1
97
- continue
98
-
99
- # Truncate if needed
100
- if len(content) > max_content_length:
101
- content = content[:max_content_length]
102
-
103
- # Generate summary
104
- summary = _generate_summary(client, content, model_name, logger)
105
-
106
- if summary:
107
- # Add to metadata
108
- _add_summary(df, idx, row, summary, model_name, logger)
109
- stats["summarized"] += 1
110
- else:
111
- stats["failed"] += 1
112
-
113
- except Exception as e:
114
- stats["failed"] += 1
115
- logger.error(f"Row {idx}: Error processing content: {e}")
116
-
117
- # Update the control message with modified DataFrame
118
- control_message.payload(df)
119
-
120
- logger.info(
121
- f"LLM summarization complete: {stats['summarized']}/{stats['processed']} documents summarized, "
122
- f"{stats['skipped']} skipped, {stats['failed']} failed"
84
+ # Select first and last chunk for summarization
85
+ # According to docs/docs/extraction/user_defined_functions.md#understanding-the-dataframe-payload
86
+ # the rows are not necessarily pages. they are chunks of data extracted from the document. in order to select
87
+ # pages, it must require parsing the payload to see which chunks correspond to which pages
88
+ if len(df) > 1:
89
+ # TODO: add feature to select N first and last chunks
90
+ df = df.iloc[[0, -1]]
91
+ else:
92
+ logger.info("Document has only one chunk")
93
+
94
+ # Combine all content into a single string
95
+ content_list = df.apply(
96
+ _extract_content,
97
+ axis=1,
98
+ min_content_length=min_content_length,
99
+ max_content_length=max_content_length,
100
+ stats=stats,
123
101
  )
102
+ content = " ".join(content_list)
124
103
 
125
- return control_message
104
+ # Nicely ask LLM to summarize content
105
+ summary, stats["duration"] = _generate_llm_summary(content, model_name, base_url, api_key, timeout)
126
106
 
107
+ stats["failed"] = summary is None
108
+ if not stats["failed"]:
109
+ stats["tokens"] = _estimate_tokens(content)
110
+ logger.info("Summarized %d tokens in %f seconds using %s", stats["tokens"], stats["duration"], model_name)
111
+ _store_summary(df, summary, model_name)
127
112
 
128
- def _extract_content(row, logger) -> Optional[str]:
129
- """Extract text content from row, trying multiple locations."""
130
- content = ""
113
+ # Update the control message with modified DataFrame
114
+ control_message.payload(df)
115
+ else:
116
+ logger.warning("%s failed to summarize content", model_name)
131
117
 
132
- # Try different locations for content
133
- if isinstance(row.get("metadata"), dict):
134
- metadata = row["metadata"]
118
+ return control_message
135
119
 
136
- # Primary location: metadata.content
137
- content = metadata.get("content", "")
138
120
 
139
- # If no content, try other locations
140
- if not content:
141
- # Try in text_metadata
142
- text_metadata = metadata.get("text_metadata", {})
143
- content = text_metadata.get("text", "") or text_metadata.get("content", "")
121
+ def _extract_content(row, stats: dict, min_content_length: int = 50, max_content_length: int = 12000) -> str | None:
122
+ """Extract text content from row"""
123
+ metadata = row.get("metadata")
144
124
 
145
- # Try top-level content field
146
- if not content:
147
- content = row.get("content", "")
125
+ if isinstance(metadata, dict):
126
+ content = metadata.get("content")
127
+ if content is not None:
128
+ content = content.strip()
129
+ if len(content) < min_content_length:
130
+ stats["skipped"] = True
131
+ logger.warning(f"Content less than min={min_content_length}. Skipping...")
132
+ content = ""
133
+ elif len(content) > max_content_length:
134
+ logger.warning(f"Truncating content to {max_content_length} characters")
135
+ content = content[:max_content_length]
136
+ else:
137
+ stats["skipped"] = True
138
+ content = ""
148
139
 
149
- if not content:
150
- return None
140
+ else:
141
+ stats["skipped"] = True
142
+ logger.warning("No metadata found. Skipping...")
143
+ content = ""
151
144
 
152
145
  return content
153
146
 
154
147
 
155
- def _generate_summary(client, content: str, model_name: str, logger) -> Optional[str]:
156
- """Generate summary with robust error handling."""
157
- prompt = f"""Please provide a comprehensive 3-4 sentence summary of the following document:
158
-
159
- {content}
160
-
161
- Focus on the main purpose, key topics, and important details.
162
- This summary will be used for document search and understanding.
163
-
164
- Summary:"""
148
+ def _generate_llm_summary(
149
+ content: str,
150
+ model_name: str,
151
+ base_url: str,
152
+ api_key: str,
153
+ timeout: int,
154
+ ) -> tuple[str | None, float]:
155
+ """Ask an LLM to summarize content extracted from doc."""
165
156
 
157
+ start_time = time.time()
166
158
  try:
159
+ from openai import OpenAI
160
+
161
+ client = OpenAI(base_url=base_url, api_key=api_key, timeout=timeout)
162
+ start_time = time.time()
167
163
  completion = client.chat.completions.create(
168
164
  model=model_name,
169
- messages=[{"role": "user", "content": prompt}],
165
+ messages=[{"role": "user", "content": PROMPT.format(content=content)}],
170
166
  max_tokens=400, # Increased for more comprehensive summaries
171
167
  temperature=0.7,
172
168
  )
169
+ duration = time.time() - start_time
173
170
 
174
- if completion.choices and len(completion.choices) > 0:
171
+ if completion.choices:
175
172
  summary = completion.choices[0].message.content.strip()
176
- return summary
177
- else:
178
- return None
173
+ return summary, duration
174
+ return None, duration
179
175
 
180
176
  except Exception as e:
181
177
  logger.error(f"API call failed: {e}")
182
- return None
178
+ # TODO: GitHub Thread
179
+ # Reviewers, tell me if this is a bad idea.
180
+ # I think the convention is to return timestamp for time even if it fails
181
+ return None, time.time() - start_time
183
182
 
184
183
 
185
- def _add_summary(df, idx: int, row, summary: str, model_name: str, logger):
186
- """Add summary to metadata with safe handling."""
187
- try:
188
- # Get current metadata or create new dict - handle None case properly
189
- existing_metadata = row.get("metadata")
190
- if existing_metadata is not None and isinstance(existing_metadata, dict):
191
- metadata = dict(existing_metadata) # Create a copy
192
- else:
193
- metadata = {}
184
+ def _store_summary(df, summary: str, model_name: str):
185
+ """Add summary to metadata and store in df"""
186
+ # hardcoded heuristic to store everything on chunk 0's metadata
187
+ row_0 = df.iloc[0]
194
188
 
195
- # Ensure custom_content exists
196
- if "custom_content" not in metadata or metadata["custom_content"] is None:
197
- metadata["custom_content"] = {}
189
+ # this is a reference to a dictionary that is stored in the dataframe
190
+ # and is modified in place
191
+ metadata = row_0.get("metadata")
198
192
 
199
- # Add LLM summary
200
- metadata["custom_content"]["llm_summary"] = {"summary": summary, "model": model_name}
193
+ if metadata.get("custom_content") is None:
194
+ metadata["custom_content"] = {}
195
+ metadata["custom_content"]["llm_summarizer_udf"] = {"summary": summary, "model": model_name}
201
196
 
202
- # Update the DataFrame at the specific index
203
- try:
204
- df.at[idx, "metadata"] = metadata
205
- except Exception:
206
- # Alternative approach: update the original row reference
207
- df.iloc[idx]["metadata"] = metadata
208
197
 
209
- except Exception as e:
210
- logger.error(f"Failed to add summary to row {idx}: {e}")
198
+ def _estimate_tokens(text: str) -> int:
199
+ """Rough estimate (~4 characters per token)"""
200
+ return len(text) // 4
201
+
202
+
203
+ def _safe_model_name(name: str) -> str:
204
+ return name.replace("/", "__").replace("-", "_")
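The two new helpers at the bottom of the file are pure functions, so their behavior is easy to check directly (note that _safe_model_name is not referenced in any hunk shown):

# _estimate_tokens: integer division by 4 characters per token
_estimate_tokens("hello world")   # 11 // 4 -> 2

# _safe_model_name: "/" -> "__", then "-" -> "_"
_safe_model_name("nvidia/llama-3.1-nemotron-70b-instruct")
# -> "nvidia__llama_3.1_nemotron_70b_instruct"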