nv-ingest-api 2025.4.20.dev20250420__py3-none-any.whl → 2025.4.22.dev20250422__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (153) hide show
  1. nv_ingest_api/__init__.py +3 -0
  2. nv_ingest_api/interface/__init__.py +215 -0
  3. nv_ingest_api/interface/extract.py +972 -0
  4. nv_ingest_api/interface/mutate.py +154 -0
  5. nv_ingest_api/interface/store.py +218 -0
  6. nv_ingest_api/interface/transform.py +382 -0
  7. nv_ingest_api/interface/utility.py +200 -0
  8. nv_ingest_api/internal/enums/__init__.py +3 -0
  9. nv_ingest_api/internal/enums/common.py +494 -0
  10. nv_ingest_api/internal/extract/__init__.py +3 -0
  11. nv_ingest_api/internal/extract/audio/__init__.py +3 -0
  12. nv_ingest_api/internal/extract/audio/audio_extraction.py +149 -0
  13. nv_ingest_api/internal/extract/docx/__init__.py +5 -0
  14. nv_ingest_api/internal/extract/docx/docx_extractor.py +205 -0
  15. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  16. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +3 -0
  17. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +122 -0
  18. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +895 -0
  19. nv_ingest_api/internal/extract/image/__init__.py +3 -0
  20. nv_ingest_api/internal/extract/image/chart_extractor.py +353 -0
  21. nv_ingest_api/internal/extract/image/image_extractor.py +204 -0
  22. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +3 -0
  23. nv_ingest_api/internal/extract/image/image_helpers/common.py +403 -0
  24. nv_ingest_api/internal/extract/image/infographic_extractor.py +253 -0
  25. nv_ingest_api/internal/extract/image/table_extractor.py +344 -0
  26. nv_ingest_api/internal/extract/pdf/__init__.py +3 -0
  27. nv_ingest_api/internal/extract/pdf/engines/__init__.py +19 -0
  28. nv_ingest_api/internal/extract/pdf/engines/adobe.py +484 -0
  29. nv_ingest_api/internal/extract/pdf/engines/llama.py +243 -0
  30. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +597 -0
  31. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +146 -0
  32. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +603 -0
  33. nv_ingest_api/internal/extract/pdf/engines/tika.py +96 -0
  34. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +426 -0
  35. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +74 -0
  36. nv_ingest_api/internal/extract/pptx/__init__.py +5 -0
  37. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  38. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +799 -0
  39. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +187 -0
  40. nv_ingest_api/internal/mutate/__init__.py +3 -0
  41. nv_ingest_api/internal/mutate/deduplicate.py +110 -0
  42. nv_ingest_api/internal/mutate/filter.py +133 -0
  43. nv_ingest_api/internal/primitives/__init__.py +0 -0
  44. nv_ingest_api/{primitives → internal/primitives}/control_message_task.py +4 -0
  45. nv_ingest_api/{primitives → internal/primitives}/ingest_control_message.py +5 -2
  46. nv_ingest_api/internal/primitives/nim/__init__.py +8 -0
  47. nv_ingest_api/internal/primitives/nim/default_values.py +15 -0
  48. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +3 -0
  49. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +274 -0
  50. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +56 -0
  51. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +270 -0
  52. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +275 -0
  53. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +238 -0
  54. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +462 -0
  55. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +367 -0
  56. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +132 -0
  57. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +152 -0
  58. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +1400 -0
  59. nv_ingest_api/internal/primitives/nim/nim_client.py +344 -0
  60. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +81 -0
  61. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  62. nv_ingest_api/internal/primitives/tracing/latency.py +69 -0
  63. nv_ingest_api/internal/primitives/tracing/logging.py +96 -0
  64. nv_ingest_api/internal/primitives/tracing/tagging.py +197 -0
  65. nv_ingest_api/internal/schemas/__init__.py +3 -0
  66. nv_ingest_api/internal/schemas/extract/__init__.py +3 -0
  67. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +130 -0
  68. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +135 -0
  69. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +124 -0
  70. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +124 -0
  71. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +128 -0
  72. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +218 -0
  73. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +124 -0
  74. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +129 -0
  75. nv_ingest_api/internal/schemas/message_brokers/__init__.py +3 -0
  76. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +23 -0
  77. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +34 -0
  78. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +19 -0
  79. nv_ingest_api/internal/schemas/meta/__init__.py +3 -0
  80. nv_ingest_api/internal/schemas/meta/base_model_noext.py +11 -0
  81. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +237 -0
  82. nv_ingest_api/internal/schemas/meta/metadata_schema.py +221 -0
  83. nv_ingest_api/internal/schemas/mutate/__init__.py +3 -0
  84. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +16 -0
  85. nv_ingest_api/internal/schemas/store/__init__.py +3 -0
  86. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +28 -0
  87. nv_ingest_api/internal/schemas/store/store_image_schema.py +30 -0
  88. nv_ingest_api/internal/schemas/transform/__init__.py +3 -0
  89. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +15 -0
  90. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +17 -0
  91. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +25 -0
  92. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +22 -0
  93. nv_ingest_api/internal/store/__init__.py +3 -0
  94. nv_ingest_api/internal/store/embed_text_upload.py +236 -0
  95. nv_ingest_api/internal/store/image_upload.py +232 -0
  96. nv_ingest_api/internal/transform/__init__.py +3 -0
  97. nv_ingest_api/internal/transform/caption_image.py +205 -0
  98. nv_ingest_api/internal/transform/embed_text.py +496 -0
  99. nv_ingest_api/internal/transform/split_text.py +157 -0
  100. nv_ingest_api/util/__init__.py +0 -0
  101. nv_ingest_api/util/control_message/__init__.py +0 -0
  102. nv_ingest_api/util/control_message/validators.py +47 -0
  103. nv_ingest_api/util/converters/__init__.py +0 -0
  104. nv_ingest_api/util/converters/bytetools.py +78 -0
  105. nv_ingest_api/util/converters/containers.py +65 -0
  106. nv_ingest_api/util/converters/datetools.py +90 -0
  107. nv_ingest_api/util/converters/dftools.py +127 -0
  108. nv_ingest_api/util/converters/formats.py +64 -0
  109. nv_ingest_api/util/converters/type_mappings.py +27 -0
  110. nv_ingest_api/util/detectors/__init__.py +5 -0
  111. nv_ingest_api/util/detectors/language.py +38 -0
  112. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  113. nv_ingest_api/util/exception_handlers/converters.py +72 -0
  114. nv_ingest_api/util/exception_handlers/decorators.py +223 -0
  115. nv_ingest_api/util/exception_handlers/detectors.py +74 -0
  116. nv_ingest_api/util/exception_handlers/pdf.py +116 -0
  117. nv_ingest_api/util/exception_handlers/schemas.py +68 -0
  118. nv_ingest_api/util/image_processing/__init__.py +5 -0
  119. nv_ingest_api/util/image_processing/clustering.py +260 -0
  120. nv_ingest_api/util/image_processing/processing.py +179 -0
  121. nv_ingest_api/util/image_processing/table_and_chart.py +449 -0
  122. nv_ingest_api/util/image_processing/transforms.py +407 -0
  123. nv_ingest_api/util/logging/__init__.py +0 -0
  124. nv_ingest_api/util/logging/configuration.py +31 -0
  125. nv_ingest_api/util/message_brokers/__init__.py +3 -0
  126. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +9 -0
  127. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +465 -0
  128. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +71 -0
  129. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +451 -0
  130. nv_ingest_api/util/metadata/__init__.py +5 -0
  131. nv_ingest_api/util/metadata/aggregators.py +469 -0
  132. nv_ingest_api/util/multi_processing/__init__.py +8 -0
  133. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +194 -0
  134. nv_ingest_api/util/nim/__init__.py +56 -0
  135. nv_ingest_api/util/pdf/__init__.py +3 -0
  136. nv_ingest_api/util/pdf/pdfium.py +427 -0
  137. nv_ingest_api/util/schema/__init__.py +0 -0
  138. nv_ingest_api/util/schema/schema_validator.py +10 -0
  139. nv_ingest_api/util/service_clients/__init__.py +3 -0
  140. nv_ingest_api/util/service_clients/client_base.py +86 -0
  141. nv_ingest_api/util/service_clients/kafka/__init__.py +3 -0
  142. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  143. nv_ingest_api/util/service_clients/redis/redis_client.py +823 -0
  144. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  145. nv_ingest_api/util/service_clients/rest/rest_client.py +531 -0
  146. nv_ingest_api/util/string_processing/__init__.py +51 -0
  147. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/METADATA +1 -1
  148. nv_ingest_api-2025.4.22.dev20250422.dist-info/RECORD +152 -0
  149. nv_ingest_api-2025.4.20.dev20250420.dist-info/RECORD +0 -9
  150. /nv_ingest_api/{primitives → internal}/__init__.py +0 -0
  151. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/WHEEL +0 -0
  152. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.20.dev20250420.dist-info → nv_ingest_api-2025.4.22.dev20250422.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,205 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import base64
7
+ import functools
8
+ import io
9
+ import logging
10
+ from typing import Optional, Dict, Any, Union
11
+
12
+ import pandas as pd
13
+ from pydantic import BaseModel
14
+
15
+ from nv_ingest_api.internal.extract.docx.engines.docxreader_helpers.docx_helper import python_docx
16
+ from nv_ingest_api.internal.schemas.extract.extract_docx_schema import DocxExtractorSchema
17
+ from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
def _prepare_task_props(
    task_config: Union[Dict[str, Any], BaseModel], base64_row: pd.Series
) -> (Dict[str, Any], Optional[str]):
    """
    Build the per-row task properties from a task configuration and a ledger row.

    A Pydantic model is converted with ``model_dump()``; any other mapping is copied.
    The row (minus its "content" field) is then stored under ``params["row_data"]``.
    Neither the caller's ``task_config`` nor its nested "params" mapping is modified,
    so the same configuration can safely be reused for every row of a DataFrame.

    Parameters
    ----------
    task_config : Union[Dict[str, Any], BaseModel]
        A dictionary or Pydantic model containing instructions and parameters for extraction.
    base64_row : pd.Series
        A row of the extraction ledger. "content" is dropped if present; "source_id" is
        read if present.

    Returns
    -------
    Tuple[Dict[str, Any], Optional[str]]
        The prepared task properties (with "row_data" added under the "params" key) and the
        row's "source_id", or None when the row has no such entry.
    """
    # NOTE(review): the return annotation above is a tuple of types rather than
    # typing.Tuple[...]; it is kept as-is because Tuple is not imported by this module.
    if isinstance(task_config, BaseModel):
        task_props = task_config.model_dump()
    else:
        task_props = dict(task_config)

    # dict() above is only a shallow copy: without copying "params" too, the write below
    # (and any later pop() by the caller of this helper) would leak into the caller's
    # configuration and affect every subsequent row. A missing or None "params" becomes {}.
    params = dict(task_props.get("params") or {})

    # Every row field except the "content" payload travels along as row_data.
    params["row_data"] = base64_row.drop(labels=["content"], errors="ignore")
    task_props["params"] = params

    source_id = base64_row.get("source_id", None)

    return task_props, source_id
60
+
61
+
62
@unified_exception_handler
def _decode_and_extract_from_docx(
    base64_row: pd.Series,
    task_config: Union[Dict[str, Any], BaseModel],
    extraction_config: Any,
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> Any:
    """
    Decode the base64 DOCX payload of one ledger row and run ``python_docx`` on it.

    The row's "content" value is base64-decoded into an in-memory stream. The extraction
    flags are taken out of the task parameters, and the remaining parameters (row data plus
    optional "docx_extraction_config" and "trace_info" entries) are handed to ``python_docx``
    as its ``extraction_config``.

    Parameters
    ----------
    base64_row : pd.Series
        A Series containing the base64-encoded content under the key "content" and optionally
        a "source_id".
    task_config : Union[Dict[str, Any], BaseModel]
        A dictionary or Pydantic model containing extraction instructions. Its "params" entry
        may carry the boolean flags "extract_text", "extract_images", "extract_tables",
        "extract_charts" and "extract_infographics"; each defaults to False when absent.
    extraction_config : Any
        A configuration object; if it has a non-None ``docx_extraction_config`` attribute,
        that value is forwarded under the "docx_extraction_config" key.
    execution_trace_log : Optional[Dict[str, Any]], default=None
        Trace information; when given it is forwarded under the "trace_info" key.

    Returns
    -------
    Any
        Whatever ``python_docx`` returns for this document.

    Notes
    -----
    Failures (for example a row without "content", or invalid base64) pass through the
    ``unified_exception_handler`` decorator; how they surface to the caller is determined
    by that decorator, which is defined outside this module.
    """
    # Prepare per-row task properties; the source_id it also returns is not needed here.
    task_props, _ = _prepare_task_props(task_config, base64_row)

    # Decode the base64 payload into a byte stream.
    doc_stream: io.BytesIO = io.BytesIO(base64.b64decode(base64_row["content"]))

    # Work on a private copy: the pops below must never strip the flags from a params
    # dict that is shared with the caller's task_config, otherwise every row after the
    # first would silently run with all flags set to False.
    extract_params: Dict[str, Any] = dict(task_props.get("params", {}))

    # Optional boolean flags; pop() with a default cannot raise, so no KeyError handling.
    extract_text = extract_params.pop("extract_text", False)
    extract_images = extract_params.pop("extract_images", False)
    extract_tables = extract_params.pop("extract_tables", False)
    extract_charts = extract_params.pop("extract_charts", False)
    extract_infographics = extract_params.pop("extract_infographics", False)

    # Inject configuration and trace info into the extraction parameters.
    docx_extraction_config = getattr(extraction_config, "docx_extraction_config", None)
    if docx_extraction_config is not None:
        extract_params["docx_extraction_config"] = docx_extraction_config

    if execution_trace_log is not None:
        extract_params["trace_info"] = execution_trace_log

    # The trace log travels via extract_params["trace_info"]; the dedicated keyword is
    # deliberately left as None, exactly as before.
    return python_docx(
        docx_stream=doc_stream,
        extract_text=extract_text,
        extract_images=extract_images,
        extract_infographics=extract_infographics,
        extract_tables=extract_tables,
        extract_charts=extract_charts,
        extraction_config=extract_params,
        execution_trace_log=None,
    )
141
+
142
+
143
@unified_exception_handler
def extract_primitives_from_docx_internal(
    df_extraction_ledger: pd.DataFrame,
    task_config: Union[Dict[str, Any], BaseModel],
    extraction_config: DocxExtractorSchema,
    execution_trace_log: Optional[Dict[str, Any]] = None,
) -> pd.DataFrame:
    """
    Run DOCX extraction over every row of a ledger DataFrame and collect the results.

    Each row is passed to ``_decode_and_extract_from_docx`` together with the task
    configuration, extraction configuration and optional trace information. The per-row
    results are exploded, missing values are dropped, and the remaining entries are
    assembled into a new DataFrame.

    Parameters
    ----------
    df_extraction_ledger : pd.DataFrame
        The input DataFrame containing DOCX files in base64 encoding. Expected columns include
        'source_id' and 'content'.
    task_config : Union[Dict[str, Any], BaseModel]
        Configuration instructions for the document processing task, as a dictionary or a
        Pydantic model.
    extraction_config : DocxExtractorSchema
        Configuration object for DOCX extraction; forwarded unchanged to the per-row routine.
    execution_trace_log : Optional[Dict[str, Any]], default=None
        An optional dictionary containing trace information for debugging or logging.

    Returns
    -------
    pd.DataFrame
        A DataFrame with the columns "document_type", "metadata" and "uuid", one row per
        extracted entry. It is empty (with the same columns) when the ledger has no rows or
        nothing was extracted.

    Notes
    -----
    Error handling is delegated to the ``unified_exception_handler`` decorator, which is
    defined outside this module.
    """
    result_columns = ["document_type", "metadata", "uuid"]

    # A row-wise apply on a frame without rows may return a DataFrame instead of a Series,
    # on which the argument-less explode() below fails; there is nothing to extract anyway.
    if len(df_extraction_ledger.index) == 0:
        return pd.DataFrame({name: [] for name in result_columns})

    # Bind the shared configuration so the per-row callable only takes the row itself.
    _decode_and_extract = functools.partial(
        _decode_and_extract_from_docx,
        task_config=task_config,
        extraction_config=extraction_config,
        execution_trace_log=execution_trace_log,
    )

    # Apply the extraction to each row of the ledger.
    sr_extraction = df_extraction_ledger.apply(_decode_and_extract, axis=1)

    # Flatten list results into one entry per element and drop missing values.
    sr_extraction = sr_extraction.explode().dropna()

    if sr_extraction.empty:
        return pd.DataFrame({name: [] for name in result_columns})

    return pd.DataFrame(sr_extraction.to_list(), columns=result_columns)
@@ -0,0 +1,3 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,122 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ # Copyright (c) 2024, NVIDIA CORPORATION.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ # pylint: disable=too-many-locals
19
+
20
+
21
+ import logging
22
+ from typing import IO, Optional, List
23
+
24
+ from nv_ingest_api.internal.enums.common import AccessLevelEnum, DocumentTypeEnum
25
+ from nv_ingest_api.internal.enums.common import TextTypeEnum
26
+ from nv_ingest_api.internal.extract.docx.engines.docxreader_helpers.docxreader import DocxReader
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
def python_docx(
    *,
    docx_stream: IO,
    extract_text: bool,
    extract_images: bool,
    extract_infographics: bool,
    extract_tables: bool,
    extract_charts: bool,
    extraction_config: dict,
    execution_trace_log: Optional[List] = None,
):
    """
    Extract content from a DOCX byte stream using the python-docx based ``DocxReader``.

    A document has three levels - document, paragraphs and runs. To align with the
    pdf extraction, paragraphs are aliased as blocks. python-docx leaves the page number
    and line number to the renderer, so the entire document is assumed to be a single
    page. Run level parsing has been skipped but can be added as needed.

    Parameters
    ----------
    docx_stream : IO
        Byte stream of the DOCX document.
    extract_text : bool
        Specifies whether to extract text.
    extract_images : bool
        Specifies whether to extract images.
    extract_infographics : bool
        Accepted for interface compatibility; not used by this function.
    extract_tables : bool
        Specifies whether to extract tables.
    extract_charts : bool
        Specifies whether to extract charts.
    extraction_config : dict
        Extraction parameters. Read here: "row_data" (a row providing "source_id" and
        optionally "metadata"), "text_depth" (defaults to "document") and
        "docx_extraction_config" (defaults to an empty dict).
    execution_trace_log : list, optional
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    The value returned by ``DocxReader.extract_data`` for this document.
    """
    # Both arguments are part of the signature only; nothing below consumes them.
    _ = (execution_trace_log, extract_infographics)

    row = extraction_config.get("row_data")
    doc_source_id = row["source_id"]

    # Granularity at which text is emitted; validated by the enum constructor.
    depth = TextTypeEnum(extraction_config.get("text_depth", "document"))

    reader_config = extraction_config.get("docx_extraction_config", {})

    # Metadata that arrived with the row, if any, is the base for the extracted entries.
    if "metadata" in row.index:
        unified_metadata = row["metadata"]
    else:
        unified_metadata = {}

    # collection_id, partition_id and access_level are assumed to come in via source_metadata.
    incoming_source = unified_metadata.get("source_metadata", {})

    # python-docx doesn't maintain a filename, so source_id doubles as the source name.
    source_metadata = {
        "source_name": doc_source_id,
        "source_id": doc_source_id,
        "source_location": incoming_source.get("source_location", ""),
        "source_type": DocumentTypeEnum.DOCX,
        "collection_id": incoming_source.get("collection_id", ""),
        "partition_id": incoming_source.get("partition_id", -1),
        "access_level": incoming_source.get("access_level", AccessLevelEnum.UNKNOWN),
        "summary": "",
    }

    reader = DocxReader(docx_stream, source_metadata, extraction_config=reader_config)

    # Positional order is text, charts, tables, images.
    return reader.extract_data(
        unified_metadata, depth, extract_text, extract_charts, extract_tables, extract_images
    )