nv-ingest-api 26.1.0rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (177) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +218 -0
  3. nv_ingest_api/interface/extract.py +977 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +200 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +186 -0
  8. nv_ingest_api/internal/__init__.py +0 -0
  9. nv_ingest_api/internal/enums/__init__.py +3 -0
  10. nv_ingest_api/internal/enums/common.py +550 -0
  11. nv_ingest_api/internal/extract/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  13. nv_ingest_api/internal/extract/audio/audio_extraction.py +202 -0
  14. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  15. nv_ingest_api/internal/extract/docx/docx_extractor.py +232 -0
  16. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +127 -0
  19. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +971 -0
  20. nv_ingest_api/internal/extract/html/__init__.py +3 -0
  21. nv_ingest_api/internal/extract/html/html_extractor.py +84 -0
  22. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/chart_extractor.py +375 -0
  24. nv_ingest_api/internal/extract/image/image_extractor.py +208 -0
  25. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  26. nv_ingest_api/internal/extract/image/image_helpers/common.py +433 -0
  27. nv_ingest_api/internal/extract/image/infographic_extractor.py +290 -0
  28. nv_ingest_api/internal/extract/image/ocr_extractor.py +407 -0
  29. nv_ingest_api/internal/extract/image/table_extractor.py +391 -0
  30. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  31. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  32. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  33. nv_ingest_api/internal/extract/pdf/engines/llama.py +246 -0
  34. nv_ingest_api/internal/extract/pdf/engines/nemotron_parse.py +598 -0
  35. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +166 -0
  36. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +652 -0
  37. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  38. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  39. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  40. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  41. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  42. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +968 -0
  43. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +210 -0
  44. nv_ingest_api/internal/meta/__init__.py +3 -0
  45. nv_ingest_api/internal/meta/udf.py +232 -0
  46. nv_ingest_api/internal/mutate/__init__.py +3 -0
  47. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  48. nv_ingest_api/internal/mutate/filter.py +133 -0
  49. nv_ingest_api/internal/primitives/__init__.py +0 -0
  50. nv_ingest_api/internal/primitives/control_message_task.py +16 -0
  51. nv_ingest_api/internal/primitives/ingest_control_message.py +307 -0
  52. nv_ingest_api/internal/primitives/nim/__init__.py +9 -0
  53. nv_ingest_api/internal/primitives/nim/default_values.py +14 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +338 -0
  59. nv_ingest_api/internal/primitives/nim/model_interface/nemotron_parse.py +239 -0
  60. nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +776 -0
  61. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  62. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +129 -0
  63. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +177 -0
  64. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1681 -0
  65. nv_ingest_api/internal/primitives/nim/nim_client.py +801 -0
  66. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +126 -0
  67. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  68. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  69. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  70. nv_ingest_api/internal/primitives/tracing/tagging.py +288 -0
  71. nv_ingest_api/internal/schemas/__init__.py +3 -0
  72. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  73. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +133 -0
  74. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +144 -0
  75. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +129 -0
  76. nv_ingest_api/internal/schemas/extract/extract_html_schema.py +34 -0
  77. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +126 -0
  78. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +137 -0
  79. nv_ingest_api/internal/schemas/extract/extract_ocr_schema.py +137 -0
  80. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +220 -0
  81. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +128 -0
  82. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +137 -0
  83. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +37 -0
  85. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  86. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  87. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  88. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  89. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +355 -0
  90. nv_ingest_api/internal/schemas/meta/metadata_schema.py +394 -0
  91. nv_ingest_api/internal/schemas/meta/udf.py +23 -0
  92. nv_ingest_api/internal/schemas/mixins.py +39 -0
  93. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  94. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  95. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  96. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  97. nv_ingest_api/internal/schemas/store/store_image_schema.py +45 -0
  98. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  99. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +36 -0
  100. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  101. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +48 -0
  102. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +24 -0
  103. nv_ingest_api/internal/store/__init__.py +3 -0
  104. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  105. nv_ingest_api/internal/store/image_upload.py +251 -0
  106. nv_ingest_api/internal/transform/__init__.py +3 -0
  107. nv_ingest_api/internal/transform/caption_image.py +219 -0
  108. nv_ingest_api/internal/transform/embed_text.py +702 -0
  109. nv_ingest_api/internal/transform/split_text.py +182 -0
  110. nv_ingest_api/util/__init__.py +3 -0
  111. nv_ingest_api/util/control_message/__init__.py +0 -0
  112. nv_ingest_api/util/control_message/validators.py +47 -0
  113. nv_ingest_api/util/converters/__init__.py +0 -0
  114. nv_ingest_api/util/converters/bytetools.py +78 -0
  115. nv_ingest_api/util/converters/containers.py +65 -0
  116. nv_ingest_api/util/converters/datetools.py +90 -0
  117. nv_ingest_api/util/converters/dftools.py +127 -0
  118. nv_ingest_api/util/converters/formats.py +64 -0
  119. nv_ingest_api/util/converters/type_mappings.py +27 -0
  120. nv_ingest_api/util/dataloader/__init__.py +9 -0
  121. nv_ingest_api/util/dataloader/dataloader.py +409 -0
  122. nv_ingest_api/util/detectors/__init__.py +5 -0
  123. nv_ingest_api/util/detectors/language.py +38 -0
  124. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  125. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  126. nv_ingest_api/util/exception_handlers/decorators.py +429 -0
  127. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  128. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  129. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  130. nv_ingest_api/util/image_processing/__init__.py +5 -0
  131. nv_ingest_api/util/image_processing/clustering.py +260 -0
  132. nv_ingest_api/util/image_processing/processing.py +177 -0
  133. nv_ingest_api/util/image_processing/table_and_chart.py +504 -0
  134. nv_ingest_api/util/image_processing/transforms.py +850 -0
  135. nv_ingest_api/util/imports/__init__.py +3 -0
  136. nv_ingest_api/util/imports/callable_signatures.py +108 -0
  137. nv_ingest_api/util/imports/dynamic_resolvers.py +158 -0
  138. nv_ingest_api/util/introspection/__init__.py +3 -0
  139. nv_ingest_api/util/introspection/class_inspect.py +145 -0
  140. nv_ingest_api/util/introspection/function_inspect.py +65 -0
  141. nv_ingest_api/util/logging/__init__.py +0 -0
  142. nv_ingest_api/util/logging/configuration.py +102 -0
  143. nv_ingest_api/util/logging/sanitize.py +84 -0
  144. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  145. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  146. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  147. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  148. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  149. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +455 -0
  150. nv_ingest_api/util/metadata/__init__.py +5 -0
  151. nv_ingest_api/util/metadata/aggregators.py +516 -0
  152. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  153. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +200 -0
  154. nv_ingest_api/util/nim/__init__.py +161 -0
  155. nv_ingest_api/util/pdf/__init__.py +3 -0
  156. nv_ingest_api/util/pdf/pdfium.py +428 -0
  157. nv_ingest_api/util/schema/__init__.py +3 -0
  158. nv_ingest_api/util/schema/schema_validator.py +10 -0
  159. nv_ingest_api/util/service_clients/__init__.py +3 -0
  160. nv_ingest_api/util/service_clients/client_base.py +86 -0
  161. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  162. nv_ingest_api/util/service_clients/redis/__init__.py +3 -0
  163. nv_ingest_api/util/service_clients/redis/redis_client.py +983 -0
  164. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  165. nv_ingest_api/util/service_clients/rest/rest_client.py +595 -0
  166. nv_ingest_api/util/string_processing/__init__.py +51 -0
  167. nv_ingest_api/util/string_processing/configuration.py +682 -0
  168. nv_ingest_api/util/string_processing/yaml.py +109 -0
  169. nv_ingest_api/util/system/__init__.py +0 -0
  170. nv_ingest_api/util/system/hardware_info.py +594 -0
  171. nv_ingest_api-26.1.0rc4.dist-info/METADATA +237 -0
  172. nv_ingest_api-26.1.0rc4.dist-info/RECORD +177 -0
  173. nv_ingest_api-26.1.0rc4.dist-info/WHEEL +5 -0
  174. nv_ingest_api-26.1.0rc4.dist-info/licenses/LICENSE +201 -0
  175. nv_ingest_api-26.1.0rc4.dist-info/top_level.txt +2 -0
  176. udfs/__init__.py +5 -0
  177. udfs/llm_summarizer_udf.py +259 -0
@@ -0,0 +1,259 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ LLM Content Summarizer UDF for NV-Ingest Pipeline
4
+
5
+ This UDF uses an LLM to generate concise summaries of text content chunks. These summaries are added to the metadata
6
+ for enhanced downstream processing and search capabilities.
7
+
8
+ By default, this uses NVIDIA BUILD-hosted Nemotron-mini-4b-instruct as an example, but you can customize it to use any
9
+ OpenAI-compatible endpoint (other NVIDIA BUILD models, local LLMs via Ollama/vLLM, self-hosted NIM, etc.) by setting
10
+ LLM_SUMMARIZATION_BASE_URL and LLM_SUMMARIZATION_MODEL.
11
+
12
+ Environment variables (can be treated as kwargs):
13
+ - NVIDIA_API_KEY: API key for NVIDIA NIM endpoints (falls back to NGC_API_KEY); summarization is skipped when neither is set
14
+ - LLM_SUMMARIZATION_MODEL: Model to use (default: nvidia/nemotron-mini-4b-instruct)
15
+ - LLM_SUMMARIZATION_BASE_URL: Base URL for OpenAI-compatible API (default: https://integrate.api.nvidia.com/v1)
16
+ - LLM_SUMMARIZATION_TIMEOUT: API timeout in seconds (default: 60)
17
+ - LLM_MIN_CONTENT_LENGTH: Minimum content length to summarize (default: 50)
18
+ - LLM_MAX_CONTENT_LENGTH: Maximum content length to send to API (default: 12000)
19
+
20
+ More info can be found in `examples/udfs/README.md`
21
+ """
22
+
23
+ import logging
24
+ import os
25
+ import time
26
+
27
# Module-level logger shared by every function in this UDF.
logger = logging.getLogger(__name__)

# Prompt template sent as the single user message of the chat completion request;
# the ``{content}`` placeholder is filled in with ``str.format``.
# NOTE(review): the wording refers to the "first and last page", but content_summarizer selects the first
# and last rows (chunks) of the payload, which are not necessarily pages.
PROMPT = """
Based on the contents from the first and last page of a document below, provide a single sentence summary that \
captures the main purpose and key topics. Do not add special characters for formatting.

[CONTENT]
{content}
[END CONTENT]
"""
37
+
38
+
39
def content_summarizer(control_message: "IngestControlMessage") -> "IngestControlMessage":  # noqa: F821
    """
    UDF function that adds an LLM-generated document summary to the payload metadata.

    The first and last rows of the payload DataFrame (or the only row) are combined into a single
    prompt and sent to an OpenAI-compatible endpoint. The result is stored through ``_store_summary``
    on the first row of a copy of the payload, which then replaces the payload; every row is kept.
    The LLM call is skipped when the selected rows yield no usable text.

    Configuration is read from environment variables on every call.

    Parameters
    ----------
    control_message : IngestControlMessage
        The control message containing the DataFrame payload with text content

    Returns
    -------
    IngestControlMessage
        The same control message. It is returned untouched when no API key is configured or the
        payload is empty; otherwise its payload is replaced with the annotated DataFrame.
    """
    # perf_counter is monotonic, so the reported durations cannot be skewed by clock adjustments.
    udf_start_time = time.perf_counter()

    # Load configuration
    api_key = os.getenv("NVIDIA_API_KEY") or os.getenv("NGC_API_KEY")  # Using NGC_API_KEY if NVIDIA_API_KEY is not set
    model_name = os.getenv("LLM_SUMMARIZATION_MODEL", "nvidia/nemotron-mini-4b-instruct")
    base_url = os.getenv("LLM_SUMMARIZATION_BASE_URL", "https://integrate.api.nvidia.com/v1")
    min_content_length = int(os.getenv("LLM_MIN_CONTENT_LENGTH", 50))
    max_content_length = int(os.getenv("LLM_MAX_CONTENT_LENGTH", 12000))
    timeout = int(os.getenv("LLM_SUMMARIZATION_TIMEOUT", 60))

    stats = {
        "processed": 0,
        "summarized": 0,
        "skipped": 0,
        "failed": 0,
        "tokens": 0,
    }

    logger.info(f"Configuration: model={model_name}, base_url={base_url}")
    logger.info(
        f"Configuration: timeout={timeout}s, min_content={min_content_length}, max_content={max_content_length}"
    )

    if not api_key:
        logger.error("NVIDIA_API_KEY not set - skipping LLM summarization")
        return control_message

    df = control_message.payload()
    if df is None or df.empty:
        logger.warning("Empty payload - skipping LLM summarization")
        return control_message

    # Extract document name
    doc_name = _extract_document_name(df)
    logger.info(f"LLM summarization starting: {doc_name} ({len(df)} chunks, model={model_name})")

    # Save original dataframe to preserve all chunks
    original_df = df.copy()

    extraction_start = time.perf_counter()
    if len(df) > 1:
        # Select first and last chunk for summarization
        # TODO: add feature to select N first and last chunks
        # According to docs/docs/extraction/user_defined_functions.md#understanding-the-dataframe-payload
        # the rows are not necessarily pages. they are chunks of data extracted from the document. in order to select
        # pages, it must require parsing the payload to see which chunks correspond to which pages and then selecting
        # from there
        logger.info(f"Selecting first and last chunks (out of {len(df)} total) for summarization")
        selected_df = df.iloc[[0, -1]]
    else:
        logger.info("Document has only one chunk")
        selected_df = df

    # Combine all content into a single string
    logger.info("Extracting and combining content from selected chunks...")
    pieces = selected_df.apply(
        _extract_content,
        axis=1,
        min_content_length=min_content_length,
        max_content_length=max_content_length,
        stats=stats,
    )
    # Skipped chunks come back empty (or as a non-string when the content field is missing);
    # drop them so str.join never receives a non-string and no stray separators are added.
    content = " ".join(piece for piece in pieces if isinstance(piece, str) and piece)
    stats["tokens"] = _estimate_tokens(content)
    extraction_time = time.perf_counter() - extraction_start
    logger.info(
        f"Content extraction completed: {len(content)} characters, "
        f"~{stats['tokens']} tokens (took {extraction_time:.2f}s)"
    )

    if content.strip():
        logger.info(f"Calling LLM API ({model_name}) for summarization...")
        summary, llm_duration = _generate_llm_summary(content, model_name, base_url, api_key, timeout)
        status = "SUCCESS" if summary else "FAILED"

        if summary:
            tokens_per_sec = stats["tokens"] / llm_duration if llm_duration > 0 else 0
            logger.info(
                f"LLM API call completed: duration={llm_duration:.2f}s, "
                f"tokens={stats['tokens']}, throughput={tokens_per_sec:.1f} tokens/s"
            )
            logger.info(
                f"Generated summary ({len(summary)} chars): {summary[:100]}..."
                if len(summary) > 100
                else f"Generated summary: {summary}"
            )
        else:
            logger.error(f"LLM API call failed (took {llm_duration:.2f}s)")
    else:
        # Nothing to summarize: calling the model with an empty prompt would only produce a made-up summary.
        logger.warning("No usable content in selected chunks - skipping LLM API call")
        summary, llm_duration = None, 0.0
        status = "SKIPPED"

    # Store summary in chunk 0 of the original dataframe (preserves all chunks).
    # A failed or skipped run stores None, matching the behaviour of a failed API call.
    _store_summary(original_df, summary, model_name)

    # Calculate total UDF time
    udf_total_time = time.perf_counter() - udf_start_time

    # Log summary
    logger.info("=" * 80)
    logger.info(f"LLM Summarization Complete - Document: {doc_name}")
    logger.info(f"  Status: {status}")
    logger.info(f"  Model: {model_name}")
    logger.info(f"  Content extraction time: {extraction_time:.2f}s")
    logger.info(f"  LLM API call time: {llm_duration:.2f}s")
    logger.info(f"  Total UDF time: {udf_total_time:.2f}s")
    logger.info(f"  Chunks preserved: {len(original_df)} (all chunks kept)")
    if summary and llm_duration > 0:
        logger.info(f"  Throughput: {stats['tokens']/llm_duration:.1f} tokens/s")
    logger.info("=" * 80)

    # Update the control message with modified DataFrame (all chunks preserved)
    control_message.payload(original_df)
    return control_message
166
+
167
+
168
def _extract_content(row, min_content_length: int, max_content_length: int, stats: dict) -> str:
    """
    Extract the text content of one payload row.

    Parameters
    ----------
    row : Mapping-like
        Row exposing ``.get``; the text is read from ``row["metadata"]["content"]``.
    min_content_length : int
        Stripped content shorter than this is skipped.
    max_content_length : int
        Stripped content longer than this is truncated to this many characters.
    stats : dict
        Counter dict; ``stats["skipped"]`` is incremented for every row that yields no text.

    Returns
    -------
    str
        The stripped (and possibly truncated) content, or ``""`` when the row is skipped.
        Always a string, so callers can join the results safely.
    """
    metadata = row.get("metadata")
    raw_content = metadata.get("content") if isinstance(metadata, dict) else None

    # Missing metadata, a missing/None content field, or non-text content: nothing to summarize.
    if not isinstance(raw_content, str):
        stats["skipped"] += 1
        return ""

    content = raw_content.strip()
    if len(content) < min_content_length:
        stats["skipped"] += 1
        return ""

    if len(content) > max_content_length:
        logger.debug(f"Truncating content to {max_content_length} characters")
        content = content[:max_content_length]

    return content
187
+
188
+
189
def _generate_llm_summary(
    content: str,
    model_name: str,
    base_url: str,
    api_key: str,
    timeout: int,
    max_tokens: int = 400,
    temperature: float = 0.7,
) -> tuple[str | None, float]:
    """
    Generate summary using LLM API.

    Parameters
    ----------
    content : str
        Text inserted into the ``PROMPT`` template.
    model_name : str
        Model identifier passed to the chat completion request.
    base_url : str
        Base URL of the OpenAI-compatible endpoint.
    api_key : str
        API key handed to the client.
    timeout : int
        Client timeout in seconds.
    max_tokens : int, optional
        Upper bound on generated tokens (default 400).
    temperature : float, optional
        Sampling temperature (default 0.7).

    Returns
    -------
    tuple[str | None, float]
        Summary text (or None if the call failed or produced no text) and duration in seconds.
        No exception escapes this function; failures are logged.
    """
    # perf_counter is monotonic, so the measured duration cannot go negative on clock adjustments.
    start_time = time.perf_counter()

    try:
        # Imported inside the try block so that a missing ``openai`` package is handled
        # like any other failure of the call instead of breaking the module import.
        from openai import OpenAI

        client = OpenAI(base_url=base_url, api_key=api_key, timeout=timeout)

        completion = client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": PROMPT.format(content=content)}],
            max_tokens=max_tokens,
            temperature=temperature,
        )

        duration = time.perf_counter() - start_time

        if not completion.choices:
            logger.warning("LLM returned no completion choices")
            return None, duration

        # The message content may be absent (None); treat that and blank text as "no summary"
        # rather than letting an AttributeError be reported as an API failure.
        text = completion.choices[0].message.content
        summary = text.strip() if isinstance(text, str) else ""
        if not summary:
            logger.warning("LLM returned an empty summary")
            return None, duration

        return summary, duration

    except Exception as e:
        duration = time.perf_counter() - start_time
        logger.error(f"LLM API call failed ({duration:.2f}s): {type(e).__name__}: {str(e)[:200]}")
        return None, duration
231
+
232
+
233
def _extract_document_name(df) -> str:
    """
    Extract source document name from dataframe metadata.

    Reads ``metadata["source_metadata"]["source_name"]`` from the first row.

    Returns
    -------
    str
        The source name, or ``"Unknown"`` when the frame is empty, the expected structure is
        missing, the name is empty/None, or anything goes wrong while reading it.
    """
    try:
        if len(df) == 0:
            return "Unknown"

        first_row = df.iloc[0]
        metadata = first_row.get("metadata") if "metadata" in first_row else None
        if isinstance(metadata, dict):
            source_metadata = metadata.get("source_metadata")
            if isinstance(source_metadata, dict):
                source_name = source_metadata.get("source_name")
                # A key that is present but None/empty must not leak out of a function declared to return str.
                if source_name:
                    return str(source_name)
    except Exception as e:
        # Best effort only: the name is used for log messages, so never fail the UDF over it.
        logger.debug(f"Could not extract document name: {e}")
    return "Unknown"
245
+
246
+
247
+ def _store_summary(df, summary: str, model_name: str):
248
+ """Add summary to metadata and store in df"""
249
+ row_0 = df.iloc[0]
250
+ metadata = row_0.get("metadata")
251
+
252
+ if metadata.get("custom_content") is None:
253
+ metadata["custom_content"] = {}
254
+ metadata["custom_content"]["llm_summarizer_udf"] = {"summary": summary, "model": model_name}
255
+
256
+
257
def _estimate_tokens(text: str) -> int:
    """Approximate the token count of ``text``, assuming about four characters per token."""
    chars_per_token = 4
    whole_tokens, _leftover_chars = divmod(len(text), chars_per_token)
    return whole_tokens