nv-ingest-api 2025.4.18.dev20250418__py3-none-any.whl → 2025.4.19.dev20250419__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (153) hide show
  1. nv_ingest_api/__init__.py +0 -3
  2. nv_ingest_api/{internal/primitives → primitives}/control_message_task.py +0 -4
  3. nv_ingest_api/{internal/primitives → primitives}/ingest_control_message.py +2 -5
  4. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/METADATA +1 -1
  5. nv_ingest_api-2025.4.19.dev20250419.dist-info/RECORD +9 -0
  6. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/WHEEL +1 -1
  7. nv_ingest_api/interface/__init__.py +0 -215
  8. nv_ingest_api/interface/extract.py +0 -972
  9. nv_ingest_api/interface/mutate.py +0 -154
  10. nv_ingest_api/interface/store.py +0 -218
  11. nv_ingest_api/interface/transform.py +0 -382
  12. nv_ingest_api/interface/utility.py +0 -200
  13. nv_ingest_api/internal/enums/__init__.py +0 -3
  14. nv_ingest_api/internal/enums/common.py +0 -494
  15. nv_ingest_api/internal/extract/__init__.py +0 -3
  16. nv_ingest_api/internal/extract/audio/__init__.py +0 -3
  17. nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -149
  18. nv_ingest_api/internal/extract/docx/__init__.py +0 -5
  19. nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -205
  20. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  21. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -3
  22. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -122
  23. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -895
  24. nv_ingest_api/internal/extract/image/__init__.py +0 -3
  25. nv_ingest_api/internal/extract/image/chart_extractor.py +0 -353
  26. nv_ingest_api/internal/extract/image/image_extractor.py +0 -204
  27. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -3
  28. nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -403
  29. nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -253
  30. nv_ingest_api/internal/extract/image/table_extractor.py +0 -344
  31. nv_ingest_api/internal/extract/pdf/__init__.py +0 -3
  32. nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -19
  33. nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -484
  34. nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -243
  35. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -597
  36. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -146
  37. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -603
  38. nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -96
  39. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -426
  40. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -74
  41. nv_ingest_api/internal/extract/pptx/__init__.py +0 -5
  42. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  43. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -799
  44. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -187
  45. nv_ingest_api/internal/mutate/__init__.py +0 -3
  46. nv_ingest_api/internal/mutate/deduplicate.py +0 -110
  47. nv_ingest_api/internal/mutate/filter.py +0 -133
  48. nv_ingest_api/internal/primitives/__init__.py +0 -0
  49. nv_ingest_api/internal/primitives/nim/__init__.py +0 -8
  50. nv_ingest_api/internal/primitives/nim/default_values.py +0 -15
  51. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -3
  52. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -274
  53. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -56
  54. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -270
  55. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -275
  56. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -238
  57. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -462
  58. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -367
  59. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -132
  60. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -152
  61. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -1400
  62. nv_ingest_api/internal/primitives/nim/nim_client.py +0 -344
  63. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -81
  64. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  65. nv_ingest_api/internal/primitives/tracing/latency.py +0 -69
  66. nv_ingest_api/internal/primitives/tracing/logging.py +0 -96
  67. nv_ingest_api/internal/primitives/tracing/tagging.py +0 -197
  68. nv_ingest_api/internal/schemas/__init__.py +0 -3
  69. nv_ingest_api/internal/schemas/extract/__init__.py +0 -3
  70. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -130
  71. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -135
  72. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -124
  73. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -124
  74. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -128
  75. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -218
  76. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -124
  77. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -129
  78. nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -3
  79. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -23
  80. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -34
  81. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -19
  82. nv_ingest_api/internal/schemas/meta/__init__.py +0 -3
  83. nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -11
  84. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -237
  85. nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -221
  86. nv_ingest_api/internal/schemas/mutate/__init__.py +0 -3
  87. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -16
  88. nv_ingest_api/internal/schemas/store/__init__.py +0 -3
  89. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -28
  90. nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -30
  91. nv_ingest_api/internal/schemas/transform/__init__.py +0 -3
  92. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -15
  93. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -17
  94. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -25
  95. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -22
  96. nv_ingest_api/internal/store/__init__.py +0 -3
  97. nv_ingest_api/internal/store/embed_text_upload.py +0 -236
  98. nv_ingest_api/internal/store/image_upload.py +0 -232
  99. nv_ingest_api/internal/transform/__init__.py +0 -3
  100. nv_ingest_api/internal/transform/caption_image.py +0 -205
  101. nv_ingest_api/internal/transform/embed_text.py +0 -496
  102. nv_ingest_api/internal/transform/split_text.py +0 -157
  103. nv_ingest_api/util/__init__.py +0 -0
  104. nv_ingest_api/util/control_message/__init__.py +0 -0
  105. nv_ingest_api/util/control_message/validators.py +0 -47
  106. nv_ingest_api/util/converters/__init__.py +0 -0
  107. nv_ingest_api/util/converters/bytetools.py +0 -78
  108. nv_ingest_api/util/converters/containers.py +0 -65
  109. nv_ingest_api/util/converters/datetools.py +0 -90
  110. nv_ingest_api/util/converters/dftools.py +0 -127
  111. nv_ingest_api/util/converters/formats.py +0 -64
  112. nv_ingest_api/util/converters/type_mappings.py +0 -27
  113. nv_ingest_api/util/detectors/__init__.py +0 -5
  114. nv_ingest_api/util/detectors/language.py +0 -38
  115. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  116. nv_ingest_api/util/exception_handlers/converters.py +0 -72
  117. nv_ingest_api/util/exception_handlers/decorators.py +0 -223
  118. nv_ingest_api/util/exception_handlers/detectors.py +0 -74
  119. nv_ingest_api/util/exception_handlers/pdf.py +0 -116
  120. nv_ingest_api/util/exception_handlers/schemas.py +0 -68
  121. nv_ingest_api/util/image_processing/__init__.py +0 -5
  122. nv_ingest_api/util/image_processing/clustering.py +0 -260
  123. nv_ingest_api/util/image_processing/processing.py +0 -179
  124. nv_ingest_api/util/image_processing/table_and_chart.py +0 -449
  125. nv_ingest_api/util/image_processing/transforms.py +0 -407
  126. nv_ingest_api/util/logging/__init__.py +0 -0
  127. nv_ingest_api/util/logging/configuration.py +0 -31
  128. nv_ingest_api/util/message_brokers/__init__.py +0 -3
  129. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -9
  130. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -465
  131. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -71
  132. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -451
  133. nv_ingest_api/util/metadata/__init__.py +0 -5
  134. nv_ingest_api/util/metadata/aggregators.py +0 -469
  135. nv_ingest_api/util/multi_processing/__init__.py +0 -8
  136. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -194
  137. nv_ingest_api/util/nim/__init__.py +0 -56
  138. nv_ingest_api/util/pdf/__init__.py +0 -3
  139. nv_ingest_api/util/pdf/pdfium.py +0 -427
  140. nv_ingest_api/util/schema/__init__.py +0 -0
  141. nv_ingest_api/util/schema/schema_validator.py +0 -10
  142. nv_ingest_api/util/service_clients/__init__.py +0 -3
  143. nv_ingest_api/util/service_clients/client_base.py +0 -86
  144. nv_ingest_api/util/service_clients/kafka/__init__.py +0 -3
  145. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  146. nv_ingest_api/util/service_clients/redis/redis_client.py +0 -823
  147. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  148. nv_ingest_api/util/service_clients/rest/rest_client.py +0 -531
  149. nv_ingest_api/util/string_processing/__init__.py +0 -51
  150. nv_ingest_api-2025.4.18.dev20250418.dist-info/RECORD +0 -152
  151. /nv_ingest_api/{internal → primitives}/__init__.py +0 -0
  152. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.19.dev20250419.dist-info}/top_level.txt +0 -0
@@ -1,197 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
-
6
- import functools
7
- import inspect
8
- import string
9
- from datetime import datetime
10
-
11
-
12
def traceable(trace_name=None):
    """
    A decorator that adds entry and exit trace timestamps to an IngestControlMessage
    based on the presence of a 'config::add_trace_tagging' flag.

    This decorator checks if the 'config::add_trace_tagging' flag is set to True in the
    message's metadata. If so, it records the entry and exit timestamps of the function
    execution, using either a provided custom trace name or the function's name by default.

    Parameters
    ----------
    trace_name : str, optional
        A custom name for the trace entries. If not provided, the decorated function's
        name is used by default.

    Returns
    -------
    decorator_trace_tagging : Callable
        A wrapper function that decorates the target function to implement trace tagging.

    Notes
    -----
    The decorated function must receive the message object as its first *positional*
    argument. The message must implement `has_metadata`, `get_metadata`, `get_timestamp`
    and `set_timestamp`, which the decorator uses to check the trace tagging flag and to
    record the trace timestamps.

    When tagging is enabled the following timestamps (taken with `datetime.now()`) are set:
    - 'trace::entry::<trace_name>': taken just before the decorated function is called.
    - 'trace::exit::<trace_name>': taken just after the decorated function returns.
    - 'latency::ts_send': set to the same value as the exit timestamp.

    If a 'latency::ts_send' timestamp is already present on entry, two more are set:
    - 'trace::entry::<trace_name>_channel_in': the previous 'latency::ts_send' value.
    - 'trace::exit::<trace_name>_channel_in': the time the wrapper was entered.

    If the decorated function raises, the exception propagates and no exit timestamp
    is recorded.

    Example
    -------
    Applying the decorator without a custom trace name:

    >>> @traceable()
    ... def process_message(message):
    ...     pass

    Applying the decorator with a custom trace name:

    >>> @traceable(trace_name="CustomTraceName")
    ... def process_message(message):
    ...     pass

    In both examples, `process_message` will have entry and exit timestamps added to the
    message if 'config::add_trace_tagging' is True.
    """

    def decorator_trace_tagging(func):
        @functools.wraps(func)
        def wrapper_trace_tagging(*args, **kwargs):
            # Taken first so it is as close as possible to the moment the message arrived.
            ts_fetched = datetime.now()
            # Assuming the first positional argument is always the message.
            message = args[0]

            # The flag must be present AND literally True; truthy non-bool values do not count.
            do_trace_tagging = (message.has_metadata("config::add_trace_tagging") is True) and (
                message.get_metadata("config::add_trace_tagging") is True
            )

            trace_prefix = trace_name if trace_name else func.__name__

            if do_trace_tagging:
                ts_send = message.get_timestamp("latency::ts_send")
                ts_entry = datetime.now()
                message.set_timestamp(f"trace::entry::{trace_prefix}", ts_entry)
                if ts_send:
                    # Span between the previous stage's send time and this stage's fetch time.
                    message.set_timestamp(f"trace::entry::{trace_prefix}_channel_in", ts_send)
                    message.set_timestamp(f"trace::exit::{trace_prefix}_channel_in", ts_fetched)

            # Call the decorated function
            result = func(*args, **kwargs)

            if do_trace_tagging:
                ts_exit = datetime.now()
                message.set_timestamp(f"trace::exit::{trace_prefix}", ts_exit)
                # Becomes the "send" time read by the next traceable stage.
                message.set_timestamp("latency::ts_send", ts_exit)

            return result

        return wrapper_trace_tagging

    return decorator_trace_tagging
95
-
96
-
97
def traceable_func(trace_name=None, dedupe=True):
    """
    A decorator that injects trace information for tracking the execution of a function.
    It logs the entry and exit timestamps of the function in a `trace_info` dictionary,
    which can be used for performance monitoring or debugging purposes.

    Parameters
    ----------
    trace_name : str, optional
        An optional string used as the prefix for the trace log entries. If not provided,
        the decorated function's name is used. The string can include placeholders (e.g.,
        "pdf_extractor::{model_name}") that will be dynamically replaced with matching
        function argument values.
    dedupe : bool, optional
        If True, ensures that the trace entry and exit keys are unique by appending an index
        (e.g., `_0`, `_1`) to the keys if duplicate entries are detected. Default is True.

    Returns
    -------
    function
        A wrapped function that injects trace information before and after the function's
        execution.

    Notes
    -----
    - `trace_info` is removed from the keyword arguments before the decorated function is
      called; the decorated function never receives it. If it is missing or None, a new
      dictionary is created and used for storing trace entries.
    - If `trace_name` contains format placeholders, the decorator attempts to populate them
      with matching positional or keyword argument values from the decorated function.
      A placeholder with no matching argument is replaced by its own name.
    - The trace information is logged in the format:
        - `trace::entry::{trace_name}` for the entry timestamp.
        - `trace::exit::{trace_name}` for the exit timestamp.
    - If `dedupe` is True, the trace keys will be appended with an index to avoid
      overwriting existing entries.
    - If the decorated function raises, no exit timestamp is recorded.

    Example
    -------
    >>> @traceable_func(trace_name="pdf_extractor::{model_name}")
    ... def extract_pdf(model_name):
    ...     pass
    >>> trace_info = {}
    >>> extract_pdf("my_model", trace_info=trace_info)

    In this example, `model_name` is dynamically replaced in the trace_name, and the
    trace information is logged with unique keys if deduplication is enabled.
    """

    def decorator_inject_trace_info(func):
        @functools.wraps(func)
        def wrapper_inject_trace_info(*args, **kwargs):
            trace_info = kwargs.pop("trace_info", None)
            if trace_info is None:
                trace_info = {}
            trace_prefix = trace_name if trace_name else func.__name__

            # Only a caller-supplied name can contain placeholders. Parsing must be skipped
            # when trace_name is None, because string.Formatter().parse(None) raises TypeError.
            if trace_name:
                # If `trace_name` is a formattable string, e.g., "pdf_extractor::{model_name}",
                # search `args` and `kwargs` to replace the placeholder.
                placeholders = [x[1] for x in string.Formatter().parse(trace_name) if x[1] is not None]
                if placeholders:
                    # Map positional values onto parameter names so they can be looked up by name.
                    arg_names = list(inspect.signature(func).parameters)
                    args_name_to_val = dict(zip(arg_names, args))

                    format_kwargs = {}
                    for name in placeholders:
                        if name in args_name_to_val:
                            arg_val = args_name_to_val[name]
                        elif name in kwargs:
                            arg_val = kwargs.get(name)
                        else:
                            # No matching argument: fall back to the placeholder's own name.
                            arg_val = name
                        format_kwargs[name] = arg_val
                    trace_prefix = trace_prefix.format(**format_kwargs)

            trace_entry_key = f"trace::entry::{trace_prefix}"
            trace_exit_key = f"trace::exit::{trace_prefix}"

            ts_entry = datetime.now()

            if dedupe:
                # Suffixes are appended with f-strings rather than str.format so that a
                # resolved prefix containing braces cannot break key construction.
                i = 0
                while (f"{trace_entry_key}_{i}" in trace_info) or (f"{trace_exit_key}_{i}" in trace_info):
                    i += 1
                trace_entry_key = f"{trace_entry_key}_{i}"
                trace_exit_key = f"{trace_exit_key}_{i}"

            trace_info[trace_entry_key] = ts_entry

            # Call the decorated function
            result = func(*args, **kwargs)

            ts_exit = datetime.now()

            trace_info[trace_exit_key] = ts_exit

            return result

        return wrapper_inject_trace_info

    return decorator_inject_trace_info
@@ -1,3 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
@@ -1,3 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
@@ -1,130 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
-
6
- import logging
7
- from typing import Optional
8
- from typing import Tuple
9
-
10
- from pydantic import BaseModel
11
- from pydantic import root_validator
12
-
13
- logger = logging.getLogger(__name__)
14
-
15
-
16
class AudioConfigSchema(BaseModel):
    """
    Configuration schema for audio extraction endpoints and options.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services.

    audio_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services for the audio endpoint.
        Either the gRPC or HTTP service can be empty, but not both.

    audio_infer_protocol : Optional[str], default=None
        Inference protocol. When not supplied it is derived from the endpoints
        ("http" if an HTTP service is configured, otherwise "grpc"). The stored
        value is always lower-cased.

    function_id : Optional[str], default=None
        Not validated by this schema.

    use_ssl : Optional[bool], default=None
        Not validated by this schema.

    ssl_cert : Optional[str], default=None
        Not validated by this schema.

    Methods
    -------
    validate_endpoints(values)
        Validates that at least one of the gRPC or HTTP services is provided for the endpoint.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for the endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = None
    audio_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    audio_infer_protocol: Optional[str] = None
    function_id: Optional[str] = None
    use_ssl: Optional[bool] = None
    ssl_cert: Optional[str] = None

    @root_validator(pre=True)
    def validate_endpoints(cls, values):
        """
        Validates the gRPC and HTTP services for the audio endpoint and normalizes
        the inference protocol.

        Parameters
        ----------
        values : dict
            Dictionary containing the raw input values for the class.

        Returns
        -------
        dict
            The validated dictionary of values, with `audio_endpoints` cleaned and
            `audio_infer_protocol` filled in and lower-cased.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty (or `audio_endpoints` is missing).
        """

        def clean_service(service):
            """Set service to None if it's an empty string or contains only spaces or quotes."""
            if service is None or not service.strip() or service.strip(" \"'") == "":
                return None
            return service

        endpoint_name = "audio_endpoints"
        # This validator runs before field defaults are applied, so the key may be absent
        # or None. Fall back to an empty pair so the documented ValueError is raised below
        # instead of a TypeError from unpacking None.
        grpc_service, http_service = values.get(endpoint_name) or (None, None)
        grpc_service = clean_service(grpc_service)
        http_service = clean_service(http_service)

        if not grpc_service and not http_service:
            raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

        values[endpoint_name] = (grpc_service, http_service)

        protocol_name = "audio_infer_protocol"
        protocol_value = values.get(protocol_name)

        if not protocol_value:
            # Prefer HTTP when both services are configured.
            protocol_value = "http" if http_service else "grpc" if grpc_service else ""

        protocol_value = protocol_value.lower()
        values[protocol_name] = protocol_value

        return values

    class Config:
        extra = "forbid"
102
-
103
-
104
class AudioExtractorSchema(BaseModel):
    """
    Configuration schema for the audio extractor settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue.

    n_workers : int, default=16
        The number of worker threads to use for processing.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception on processing failure.

    audio_extraction_config : Optional[AudioConfigSchema], default=None
        Configuration schema for the audio extraction stage.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    max_queue_size: int = 1
    n_workers: int = 16
    raise_on_failure: bool = False

    audio_extraction_config: Optional[AudioConfigSchema] = None

    class Config:
        extra = "forbid"
@@ -1,135 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
- import logging
6
- from typing import Optional
7
- from typing import Tuple
8
-
9
- from pydantic import field_validator, model_validator, ConfigDict, BaseModel
10
-
11
- logger = logging.getLogger(__name__)
12
-
13
-
14
class ChartExtractorConfigSchema(BaseModel):
    """
    Endpoint and option settings for the chart extraction services.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        (gRPC, HTTP) services for the yolox endpoint. One of the two may be
        empty, but not both.

    yolox_infer_protocol : str, default=""
        Protocol setting for yolox; stored as given by this schema.

    paddle_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        (gRPC, HTTP) services for the paddle endpoint. One of the two may be
        empty, but not both.

    paddle_infer_protocol : str, default=""
        Protocol setting for paddle; stored as given by this schema.

    nim_batch_size : int, default=2
    workers_per_progress_engine : int, default=5

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for any endpoint.

    Config
    ------
    extra : str
        Extra fields are forbidden.
    """

    auth_token: Optional[str] = None

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    paddle_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    paddle_infer_protocol: str = ""

    nim_batch_size: int = 2
    workers_per_progress_engine: int = 5

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Check every endpoint pair and normalize blank services to None.

        Parameters
        ----------
        values : dict
            Raw attribute values supplied for the model.

        Returns
        -------
        dict
            The same dictionary with each endpoint pair cleaned.

        Raises
        ------
        ValueError
            If an endpoint has neither a gRPC nor an HTTP service.
        """

        def _blank_to_none(candidate):
            """Return None for None, whitespace-only, or quote-only strings."""
            if candidate is None:
                return None
            if not candidate.strip():
                return None
            if candidate.strip(" \"'") == "":
                return None
            return candidate

        for key in ("yolox_endpoints", "paddle_endpoints"):
            grpc_raw, http_raw = values.get(key, (None, None))
            grpc_clean = _blank_to_none(grpc_raw)
            http_clean = _blank_to_none(http_raw)

            # At least one transport must survive cleaning.
            if not (grpc_clean or http_clean):
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {key}.")

            values[key] = (grpc_clean, http_clean)

        return values

    model_config = ConfigDict(extra="forbid")
102
-
103
-
104
class ChartExtractorSchema(BaseModel):
    """
    Configuration schema for chart extraction processing settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue. Must be greater than 0.

    n_workers : int, default=2
        The number of worker threads to use for processing. Must be greater than 0.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception if a failure occurs during chart extraction.

    endpoint_config : Optional[ChartExtractorConfigSchema], default=None
        Configuration for the chart extraction stage, including yolox and paddle service endpoints.

    Raises
    ------
    ValueError
        If `max_queue_size` or `n_workers` is not greater than 0.
    """

    max_queue_size: int = 1
    n_workers: int = 2
    raise_on_failure: bool = False

    endpoint_config: Optional[ChartExtractorConfigSchema] = None

    @field_validator("max_queue_size", "n_workers")
    @classmethod
    def check_positive(cls, v, field):
        """Reject zero or negative values for the queue size and worker count."""
        if v <= 0:
            # The message states the bound that is actually enforced (0).
            raise ValueError(f"{field.field_name} must be greater than 0.")
        return v

    model_config = ConfigDict(extra="forbid")
@@ -1,124 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
-
6
- import logging
7
- from typing import Optional
8
- from typing import Tuple
9
-
10
- from pydantic import model_validator, ConfigDict, BaseModel
11
-
12
- logger = logging.getLogger(__name__)
13
-
14
-
15
class DocxConfigSchema(BaseModel):
    """
    Configuration schema for docx extraction endpoints and options.

    Parameters
    ----------
    auth_token : Optional[str], default=None
        Authentication token required for secure services.

    yolox_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
        A tuple containing the gRPC and HTTP services for the yolox endpoint.
        Either the gRPC or HTTP service can be empty, but not both.

    yolox_infer_protocol : str, default=""
        Inference protocol. When not supplied it is derived from the endpoints
        ("http" if an HTTP service is configured, otherwise "grpc"). The stored
        value is always lower-cased.

    Methods
    -------
    validate_endpoints(values)
        Validates that at least one of the gRPC or HTTP services is provided for each endpoint.

    Raises
    ------
    ValueError
        If both gRPC and HTTP services are empty for any endpoint.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    auth_token: Optional[str] = None

    yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
    yolox_infer_protocol: str = ""

    @model_validator(mode="before")
    @classmethod
    def validate_endpoints(cls, values):
        """
        Validates the gRPC and HTTP services for all endpoints and normalizes
        the matching inference protocol.

        Parameters
        ----------
        values : dict
            Dictionary containing the raw input values for the class.

        Returns
        -------
        dict
            The validated dictionary of values.

        Raises
        ------
        ValueError
            If both gRPC and HTTP services are empty (or missing) for any endpoint.
        """

        def clean_service(service):
            """Set service to None if it's an empty string or contains only spaces or quotes."""
            if service is None or not service.strip() or service.strip(" \"'") == "":
                return None
            return service

        for model_name in ["yolox"]:
            endpoint_name = f"{model_name}_endpoints"
            # This validator runs before field defaults are applied, so the key may be
            # absent or None. Fall back to an empty pair so the documented ValueError is
            # raised below instead of a TypeError from unpacking None.
            grpc_service, http_service = values.get(endpoint_name) or (None, None)
            grpc_service = clean_service(grpc_service)
            http_service = clean_service(http_service)

            if not grpc_service and not http_service:
                raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")

            values[endpoint_name] = (grpc_service, http_service)

            protocol_name = f"{model_name}_infer_protocol"
            protocol_value = values.get(protocol_name)
            if not protocol_value:
                # Prefer HTTP when both services are configured.
                protocol_value = "http" if http_service else "grpc" if grpc_service else ""
            protocol_value = protocol_value.lower()
            values[protocol_name] = protocol_value

        return values

    model_config = ConfigDict(extra="forbid")
98
-
99
-
100
class DocxExtractorSchema(BaseModel):
    """
    Configuration schema for the docx extractor settings.

    Parameters
    ----------
    max_queue_size : int, default=1
        The maximum number of items allowed in the processing queue.

    n_workers : int, default=16
        The number of worker threads to use for processing.

    raise_on_failure : bool, default=False
        A flag indicating whether to raise an exception on processing failure.

    docx_extraction_config : Optional[DocxConfigSchema], default=None
        Configuration schema for the docx extraction stage.

    Config
    ------
    extra : str
        Pydantic config option to forbid extra fields.
    """

    max_queue_size: int = 1
    n_workers: int = 16
    raise_on_failure: bool = False

    docx_extraction_config: Optional[DocxConfigSchema] = None
    model_config = ConfigDict(extra="forbid")