nv-ingest-api 2025.4.18.dev20250418__py3-none-any.whl → 2025.4.20.dev20250420__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (153)
  1. nv_ingest_api/__init__.py +0 -3
  2. nv_ingest_api/{internal/primitives → primitives}/control_message_task.py +0 -4
  3. nv_ingest_api/{internal/primitives → primitives}/ingest_control_message.py +2 -5
  4. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.20.dev20250420.dist-info}/METADATA +1 -1
  5. nv_ingest_api-2025.4.20.dev20250420.dist-info/RECORD +9 -0
  6. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.20.dev20250420.dist-info}/WHEEL +1 -1
  7. nv_ingest_api/interface/__init__.py +0 -215
  8. nv_ingest_api/interface/extract.py +0 -972
  9. nv_ingest_api/interface/mutate.py +0 -154
  10. nv_ingest_api/interface/store.py +0 -218
  11. nv_ingest_api/interface/transform.py +0 -382
  12. nv_ingest_api/interface/utility.py +0 -200
  13. nv_ingest_api/internal/enums/__init__.py +0 -3
  14. nv_ingest_api/internal/enums/common.py +0 -494
  15. nv_ingest_api/internal/extract/__init__.py +0 -3
  16. nv_ingest_api/internal/extract/audio/__init__.py +0 -3
  17. nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -149
  18. nv_ingest_api/internal/extract/docx/__init__.py +0 -5
  19. nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -205
  20. nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  21. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -3
  22. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -122
  23. nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -895
  24. nv_ingest_api/internal/extract/image/__init__.py +0 -3
  25. nv_ingest_api/internal/extract/image/chart_extractor.py +0 -353
  26. nv_ingest_api/internal/extract/image/image_extractor.py +0 -204
  27. nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -3
  28. nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -403
  29. nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -253
  30. nv_ingest_api/internal/extract/image/table_extractor.py +0 -344
  31. nv_ingest_api/internal/extract/pdf/__init__.py +0 -3
  32. nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -19
  33. nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -484
  34. nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -243
  35. nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -597
  36. nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -146
  37. nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -603
  38. nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -96
  39. nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -426
  40. nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -74
  41. nv_ingest_api/internal/extract/pptx/__init__.py +0 -5
  42. nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  43. nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -799
  44. nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -187
  45. nv_ingest_api/internal/mutate/__init__.py +0 -3
  46. nv_ingest_api/internal/mutate/deduplicate.py +0 -110
  47. nv_ingest_api/internal/mutate/filter.py +0 -133
  48. nv_ingest_api/internal/primitives/__init__.py +0 -0
  49. nv_ingest_api/internal/primitives/nim/__init__.py +0 -8
  50. nv_ingest_api/internal/primitives/nim/default_values.py +0 -15
  51. nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -3
  52. nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -274
  53. nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -56
  54. nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -270
  55. nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -275
  56. nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -238
  57. nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -462
  58. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -367
  59. nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -132
  60. nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -152
  61. nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -1400
  62. nv_ingest_api/internal/primitives/nim/nim_client.py +0 -344
  63. nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -81
  64. nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  65. nv_ingest_api/internal/primitives/tracing/latency.py +0 -69
  66. nv_ingest_api/internal/primitives/tracing/logging.py +0 -96
  67. nv_ingest_api/internal/primitives/tracing/tagging.py +0 -197
  68. nv_ingest_api/internal/schemas/__init__.py +0 -3
  69. nv_ingest_api/internal/schemas/extract/__init__.py +0 -3
  70. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -130
  71. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -135
  72. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -124
  73. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -124
  74. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -128
  75. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -218
  76. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -124
  77. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -129
  78. nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -3
  79. nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -23
  80. nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -34
  81. nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -19
  82. nv_ingest_api/internal/schemas/meta/__init__.py +0 -3
  83. nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -11
  84. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +0 -237
  85. nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -221
  86. nv_ingest_api/internal/schemas/mutate/__init__.py +0 -3
  87. nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -16
  88. nv_ingest_api/internal/schemas/store/__init__.py +0 -3
  89. nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -28
  90. nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -30
  91. nv_ingest_api/internal/schemas/transform/__init__.py +0 -3
  92. nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -15
  93. nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -17
  94. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -25
  95. nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -22
  96. nv_ingest_api/internal/store/__init__.py +0 -3
  97. nv_ingest_api/internal/store/embed_text_upload.py +0 -236
  98. nv_ingest_api/internal/store/image_upload.py +0 -232
  99. nv_ingest_api/internal/transform/__init__.py +0 -3
  100. nv_ingest_api/internal/transform/caption_image.py +0 -205
  101. nv_ingest_api/internal/transform/embed_text.py +0 -496
  102. nv_ingest_api/internal/transform/split_text.py +0 -157
  103. nv_ingest_api/util/__init__.py +0 -0
  104. nv_ingest_api/util/control_message/__init__.py +0 -0
  105. nv_ingest_api/util/control_message/validators.py +0 -47
  106. nv_ingest_api/util/converters/__init__.py +0 -0
  107. nv_ingest_api/util/converters/bytetools.py +0 -78
  108. nv_ingest_api/util/converters/containers.py +0 -65
  109. nv_ingest_api/util/converters/datetools.py +0 -90
  110. nv_ingest_api/util/converters/dftools.py +0 -127
  111. nv_ingest_api/util/converters/formats.py +0 -64
  112. nv_ingest_api/util/converters/type_mappings.py +0 -27
  113. nv_ingest_api/util/detectors/__init__.py +0 -5
  114. nv_ingest_api/util/detectors/language.py +0 -38
  115. nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  116. nv_ingest_api/util/exception_handlers/converters.py +0 -72
  117. nv_ingest_api/util/exception_handlers/decorators.py +0 -223
  118. nv_ingest_api/util/exception_handlers/detectors.py +0 -74
  119. nv_ingest_api/util/exception_handlers/pdf.py +0 -116
  120. nv_ingest_api/util/exception_handlers/schemas.py +0 -68
  121. nv_ingest_api/util/image_processing/__init__.py +0 -5
  122. nv_ingest_api/util/image_processing/clustering.py +0 -260
  123. nv_ingest_api/util/image_processing/processing.py +0 -179
  124. nv_ingest_api/util/image_processing/table_and_chart.py +0 -449
  125. nv_ingest_api/util/image_processing/transforms.py +0 -407
  126. nv_ingest_api/util/logging/__init__.py +0 -0
  127. nv_ingest_api/util/logging/configuration.py +0 -31
  128. nv_ingest_api/util/message_brokers/__init__.py +0 -3
  129. nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -9
  130. nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -465
  131. nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -71
  132. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -451
  133. nv_ingest_api/util/metadata/__init__.py +0 -5
  134. nv_ingest_api/util/metadata/aggregators.py +0 -469
  135. nv_ingest_api/util/multi_processing/__init__.py +0 -8
  136. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -194
  137. nv_ingest_api/util/nim/__init__.py +0 -56
  138. nv_ingest_api/util/pdf/__init__.py +0 -3
  139. nv_ingest_api/util/pdf/pdfium.py +0 -427
  140. nv_ingest_api/util/schema/__init__.py +0 -0
  141. nv_ingest_api/util/schema/schema_validator.py +0 -10
  142. nv_ingest_api/util/service_clients/__init__.py +0 -3
  143. nv_ingest_api/util/service_clients/client_base.py +0 -86
  144. nv_ingest_api/util/service_clients/kafka/__init__.py +0 -3
  145. nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  146. nv_ingest_api/util/service_clients/redis/redis_client.py +0 -823
  147. nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  148. nv_ingest_api/util/service_clients/rest/rest_client.py +0 -531
  149. nv_ingest_api/util/string_processing/__init__.py +0 -51
  150. nv_ingest_api-2025.4.18.dev20250418.dist-info/RECORD +0 -152
  151. /nv_ingest_api/{internal → primitives}/__init__.py +0 -0
  152. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.20.dev20250420.dist-info}/licenses/LICENSE +0 -0
  153. {nv_ingest_api-2025.4.18.dev20250418.dist-info → nv_ingest_api-2025.4.20.dev20250420.dist-info}/top_level.txt +0 -0
@@ -1,129 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
-
6
- import logging
7
- from typing import Optional
8
- from typing import Tuple
9
-
10
- from pydantic import field_validator, model_validator, ConfigDict, BaseModel
11
-
12
-
13
- logger = logging.getLogger(__name__)
14
-
15
-
16
- class TableExtractorConfigSchema(BaseModel):
17
- """
18
- Configuration schema for the table extraction stage settings.
19
-
20
- Parameters
21
- ----------
22
- auth_token : Optional[str], default=None
23
- Authentication token required for secure services.
24
-
25
- paddle_endpoints : Tuple[Optional[str], Optional[str]], default=(None, None)
26
- A tuple containing the gRPC and HTTP services for the paddle endpoint.
27
- Either the gRPC or HTTP service can be empty, but not both.
28
-
29
- Methods
30
- -------
31
- validate_endpoints(values)
32
- Validates that at least one of the gRPC or HTTP services is provided for the yolox endpoint.
33
-
34
- Raises
35
- ------
36
- ValueError
37
- If both gRPC and HTTP services are empty for the yolox endpoint.
38
-
39
- Config
40
- ------
41
- extra : str
42
- Pydantic config option to forbid extra fields.
43
- """
44
-
45
- auth_token: Optional[str] = None
46
-
47
- yolox_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
48
- yolox_infer_protocol: str = ""
49
-
50
- paddle_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
51
- paddle_infer_protocol: str = ""
52
-
53
- nim_batch_size: int = 2
54
- workers_per_progress_engine: int = 5
55
-
56
- @model_validator(mode="before")
57
- @classmethod
58
- def validate_endpoints(cls, values):
59
- """
60
- Validates the gRPC and HTTP services for the yolox endpoint.
61
-
62
- Parameters
63
- ----------
64
- values : dict
65
- Dictionary containing the values of the attributes for the class.
66
-
67
- Returns
68
- -------
69
- dict
70
- The validated dictionary of values.
71
-
72
- Raises
73
- ------
74
- ValueError
75
- If both gRPC and HTTP services are empty for the yolox endpoint.
76
- """
77
-
78
- def clean_service(service):
79
- """Set service to None if it's an empty string or contains only spaces or quotes."""
80
- if service is None or not service.strip() or service.strip(" \"'") == "":
81
- return None
82
- return service
83
-
84
- for endpoint_name in ["yolox_endpoints", "paddle_endpoints"]:
85
- grpc_service, http_service = values.get(endpoint_name, (None, None))
86
- grpc_service = clean_service(grpc_service)
87
- http_service = clean_service(http_service)
88
-
89
- if not grpc_service and not http_service:
90
- raise ValueError(f"Both gRPC and HTTP services cannot be empty for {endpoint_name}.")
91
-
92
- values[endpoint_name] = (grpc_service, http_service)
93
-
94
- return values
95
-
96
- model_config = ConfigDict(extra="forbid")
97
-
98
-
99
- class TableExtractorSchema(BaseModel):
100
- """
101
- Configuration schema for the table extraction processing settings.
102
-
103
- Parameters
104
- ----------
105
- max_queue_size : int, default=1
106
- The maximum number of items allowed in the processing queue.
107
-
108
- n_workers : int, default=2
109
- The number of worker threads to use for processing.
110
-
111
- raise_on_failure : bool, default=False
112
- A flag indicating whether to raise an exception if a failure occurs during table extraction.
113
-
114
- stage_config : Optional[TableExtractorConfigSchema], default=None
115
- Configuration for the table extraction stage, including yolox service endpoints.
116
- """
117
-
118
- max_queue_size: int = 1
119
- n_workers: int = 2
120
- raise_on_failure: bool = False
121
-
122
- @field_validator("max_queue_size", "n_workers")
123
- def check_positive(cls, v, field):
124
- if v <= 0:
125
- raise ValueError(f"{field.field_name} must be greater than 10.")
126
- return v
127
-
128
- endpoint_config: Optional[TableExtractorConfigSchema] = None
129
- model_config = ConfigDict(extra="forbid")
@@ -1,3 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
@@ -1,23 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
-
6
- from typing import Optional, Literal
7
-
8
- from pydantic import Field, BaseModel
9
- from typing_extensions import Annotated
10
-
11
-
12
- class MessageBrokerClientSchema(BaseModel):
13
- host: str = "redis"
14
- port: Annotated[int, Field(gt=0, lt=65536)] = 6379
15
-
16
- # Update this for new broker types
17
- client_type: Literal["redis", "simple"] = "redis" # Restrict to 'redis' or 'simple'
18
-
19
- broker_params: Optional[dict] = Field(default_factory=dict)
20
-
21
- connection_timeout: Optional[Annotated[int, Field(ge=0)]] = 300
22
- max_backoff: Optional[Annotated[int, Field(ge=0)]] = 300
23
- max_retries: Optional[Annotated[int, Field(ge=0)]] = 0
@@ -1,34 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
-
6
- import logging
7
- from typing import Optional
8
-
9
- from pydantic import ConfigDict, BaseModel
10
- from pydantic import Field
11
-
12
- logger = logging.getLogger(__name__)
13
-
14
-
15
- # Define schemas for request validation
16
- class PushRequestSchema(BaseModel):
17
- command: str
18
- queue_name: str = Field(..., min_length=1)
19
- message: str = Field(..., min_length=1)
20
- timeout: Optional[float] = 100 # Optional timeout for blocking push
21
- model_config = ConfigDict(extra="forbid")
22
-
23
-
24
- class PopRequestSchema(BaseModel):
25
- command: str
26
- queue_name: str = Field(..., min_length=1)
27
- timeout: Optional[float] = 100 # Optional timeout for blocking pop
28
- model_config = ConfigDict(extra="forbid")
29
-
30
-
31
- class SizeRequestSchema(BaseModel):
32
- command: str
33
- queue_name: str = Field(..., min_length=1)
34
- model_config = ConfigDict(extra="forbid")
@@ -1,19 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
- # NOTE: This code is duplicated from the ingest service:
6
- # src/nv_ingest_client/schemas/response_schema.py
7
- # Eventually we should move all client wrappers for the message broker into a shared library that both the ingest
8
- # service and the client can use.
9
-
10
- from typing import Optional, Union
11
- from pydantic import BaseModel
12
-
13
-
14
- class ResponseSchema(BaseModel):
15
- response_code: int
16
- response_reason: Optional[str] = "OK"
17
- response: Union[str, dict, None] = None
18
- trace_id: Optional[str] = None # Unique trace ID
19
- transaction_id: Optional[str] = None # Unique transaction ID
@@ -1,3 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
@@ -1,11 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
-
6
- from pydantic import ConfigDict, BaseModel
7
-
8
-
9
- # Define a base class with extra fields forbidden
10
- class BaseModelNoExt(BaseModel):
11
- model_config = ConfigDict(extra="forbid")
@@ -1,237 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
- import logging
6
- from typing import Any, Dict, List, Optional, Union, Annotated
7
-
8
- from pydantic import Field, field_validator, model_validator
9
-
10
- from nv_ingest_api.internal.schemas.meta.base_model_noext import BaseModelNoExt
11
- from nv_ingest_api.internal.enums.common import ContentTypeEnum, TaskTypeEnum, DocumentTypeEnum
12
-
13
- # ------------------------------------------------------------------------------
14
- # Logging Configuration
15
- # ------------------------------------------------------------------------------
16
- logger = logging.getLogger(__name__)
17
-
18
-
19
- # ------------------------------------------------------------------------------
20
- # Schemas: Common and Task-Specific
21
- # ------------------------------------------------------------------------------
22
-
23
-
24
- # Tracing Options Schema
25
- class TracingOptionsSchema(BaseModelNoExt):
26
- trace: bool = False
27
- ts_send: int
28
- trace_id: Optional[str] = None
29
-
30
-
31
- # Ingest Task Schemas
32
-
33
-
34
- class IngestTaskSplitSchema(BaseModelNoExt):
35
- tokenizer: Optional[str] = None
36
- chunk_size: Annotated[int, Field(gt=0)] = 1024
37
- chunk_overlap: Annotated[int, Field(ge=0)] = 150
38
- params: dict
39
-
40
- @field_validator("chunk_overlap")
41
- def check_chunk_overlap(cls, v, values, **kwargs):
42
- if v is not None and "chunk_size" in values.data and v >= values.data["chunk_size"]:
43
- raise ValueError("chunk_overlap must be less than chunk_size")
44
- return v
45
-
46
-
47
- class IngestTaskExtractSchema(BaseModelNoExt):
48
- document_type: DocumentTypeEnum
49
- method: str
50
- params: dict
51
-
52
- @field_validator("document_type", mode="before")
53
- @classmethod
54
- def case_insensitive_document_type(cls, v):
55
- if isinstance(v, str):
56
- v = v.lower()
57
- try:
58
- return DocumentTypeEnum(v)
59
- except ValueError:
60
- raise ValueError(f"{v} is not a valid DocumentTypeEnum value")
61
-
62
-
63
- class IngestTaskStoreEmbedSchema(BaseModelNoExt):
64
- params: dict
65
-
66
-
67
- class IngestTaskStoreSchema(BaseModelNoExt):
68
- structured: bool = True
69
- images: bool = False
70
- method: str
71
- params: dict
72
-
73
-
74
- # Captioning: All fields are optional and override default parameters.
75
- class IngestTaskCaptionSchema(BaseModelNoExt):
76
- api_key: Optional[str] = None
77
- endpoint_url: Optional[str] = None
78
- prompt: Optional[str] = None
79
- model_name: Optional[str] = None
80
-
81
-
82
- class IngestTaskFilterParamsSchema(BaseModelNoExt):
83
- min_size: int = 128
84
- max_aspect_ratio: Union[float, int] = 5.0
85
- min_aspect_ratio: Union[float, int] = 0.2
86
- filter: bool = False
87
-
88
-
89
- class IngestTaskFilterSchema(BaseModelNoExt):
90
- # TODO: Ensure ContentTypeEnum is imported/defined as needed.
91
- content_type: ContentTypeEnum = ContentTypeEnum.IMAGE
92
- params: IngestTaskFilterParamsSchema = IngestTaskFilterParamsSchema()
93
-
94
-
95
- class IngestTaskDedupParams(BaseModelNoExt):
96
- filter: bool = False
97
-
98
-
99
- class IngestTaskDedupSchema(BaseModelNoExt):
100
- # TODO: Ensure ContentTypeEnum is imported/defined as needed.
101
- content_type: ContentTypeEnum = ContentTypeEnum.IMAGE
102
- params: IngestTaskDedupParams = IngestTaskDedupParams()
103
-
104
-
105
- class IngestTaskEmbedSchema(BaseModelNoExt):
106
- endpoint_url: Optional[str] = None
107
- model_name: Optional[str] = None
108
- api_key: Optional[str] = None
109
- filter_errors: bool = False
110
-
111
-
112
- class IngestTaskVdbUploadSchema(BaseModelNoExt):
113
- bulk_ingest: bool = False
114
- bulk_ingest_path: Optional[str] = None
115
- params: Optional[dict] = None
116
- filter_errors: bool = True
117
-
118
-
119
- class IngestTaskAudioExtraction(BaseModelNoExt):
120
- auth_token: Optional[str] = None
121
- grpc_endpoint: Optional[str] = None
122
- http_endpoint: Optional[str] = None
123
- infer_protocol: Optional[str] = None
124
- function_id: Optional[str] = None
125
- use_ssl: Optional[bool] = None
126
- ssl_cert: Optional[str] = None
127
-
128
-
129
- class IngestTaskTableExtraction(BaseModelNoExt):
130
- params: dict = Field(default_factory=dict)
131
-
132
-
133
- class IngestTaskChartExtraction(BaseModelNoExt):
134
- params: dict = Field(default_factory=dict)
135
-
136
-
137
- class IngestTaskInfographicExtraction(BaseModelNoExt):
138
- params: dict = Field(default_factory=dict)
139
-
140
-
141
- class IngestTaskSchema(BaseModelNoExt):
142
- type: TaskTypeEnum
143
- task_properties: Union[
144
- IngestTaskSplitSchema,
145
- IngestTaskExtractSchema,
146
- IngestTaskStoreEmbedSchema,
147
- IngestTaskStoreSchema,
148
- IngestTaskEmbedSchema,
149
- IngestTaskCaptionSchema,
150
- IngestTaskDedupSchema,
151
- IngestTaskFilterSchema,
152
- IngestTaskVdbUploadSchema,
153
- IngestTaskAudioExtraction,
154
- IngestTaskTableExtraction,
155
- IngestTaskChartExtraction,
156
- IngestTaskInfographicExtraction,
157
- ]
158
- raise_on_failure: bool = False
159
-
160
- @model_validator(mode="before")
161
- @classmethod
162
- def check_task_properties_type(cls, values):
163
- task_type, task_properties = values.get("type"), values.get("task_properties", {})
164
- if task_type and task_properties:
165
- expected_type = {
166
- TaskTypeEnum.CAPTION: IngestTaskCaptionSchema,
167
- TaskTypeEnum.DEDUP: IngestTaskDedupSchema,
168
- TaskTypeEnum.EMBED: IngestTaskEmbedSchema,
169
- TaskTypeEnum.EXTRACT: IngestTaskExtractSchema,
170
- TaskTypeEnum.FILTER: IngestTaskFilterSchema, # Extend mapping as necessary
171
- TaskTypeEnum.SPLIT: IngestTaskSplitSchema,
172
- TaskTypeEnum.STORE_EMBEDDING: IngestTaskStoreEmbedSchema,
173
- TaskTypeEnum.STORE: IngestTaskStoreSchema,
174
- TaskTypeEnum.VDB_UPLOAD: IngestTaskVdbUploadSchema,
175
- TaskTypeEnum.AUDIO_DATA_EXTRACT: IngestTaskAudioExtraction,
176
- TaskTypeEnum.TABLE_DATA_EXTRACT: IngestTaskTableExtraction,
177
- TaskTypeEnum.CHART_DATA_EXTRACT: IngestTaskChartExtraction,
178
- TaskTypeEnum.INFOGRAPHIC_DATA_EXTRACT: IngestTaskInfographicExtraction,
179
- }.get(
180
- task_type
181
- ) # Removed .upper()
182
-
183
- # Validate task_properties against the expected schema.
184
- validated_task_properties = expected_type(**task_properties)
185
- values["task_properties"] = validated_task_properties
186
- return values
187
-
188
- @field_validator("type", mode="before")
189
- @classmethod
190
- def case_insensitive_task_type(cls, v):
191
- if isinstance(v, str):
192
- v = v.lower()
193
- try:
194
- return TaskTypeEnum(v)
195
- except ValueError:
196
- raise ValueError(f"{v} is not a valid TaskTypeEnum value")
197
-
198
-
199
- # ------------------------------------------------------------------------------
200
- # Schemas: Job Schemas
201
- # ------------------------------------------------------------------------------
202
-
203
-
204
- class JobPayloadSchema(BaseModelNoExt):
205
- content: List[Union[str, bytes]]
206
- source_name: List[str]
207
- source_id: List[Union[str, int]]
208
- document_type: List[str]
209
-
210
-
211
- class IngestJobSchema(BaseModelNoExt):
212
- job_payload: JobPayloadSchema
213
- job_id: Union[str, int]
214
- tasks: List[IngestTaskSchema]
215
- tracing_options: Optional[TracingOptionsSchema] = None
216
-
217
-
218
- # ------------------------------------------------------------------------------
219
- # Utility Functions
220
- # ------------------------------------------------------------------------------
221
-
222
-
223
- def validate_ingest_job(job_data: Dict[str, Any]) -> IngestJobSchema:
224
- """
225
- Validates a dictionary representing an ingest_job using the IngestJobSchema.
226
-
227
- Parameters:
228
- - job_data: Dictionary representing an ingest job.
229
-
230
- Returns:
231
- - IngestJobSchema: The validated ingest job.
232
-
233
- Raises:
234
- - ValidationError: If the input data does not conform to the IngestJobSchema.
235
- """
236
-
237
- return IngestJobSchema(**job_data)
@@ -1,221 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
-
6
- import logging
7
- from datetime import datetime
8
- from typing import Any
9
- from typing import Dict
10
- from typing import List
11
- from typing import Optional
12
- from typing import Union
13
-
14
- from pydantic import field_validator, model_validator, Field
15
-
16
- from nv_ingest_api.internal.enums.common import (
17
- AccessLevelEnum,
18
- ContentTypeEnum,
19
- TextTypeEnum,
20
- LanguageEnum,
21
- TableFormatEnum,
22
- StatusEnum,
23
- DocumentTypeEnum,
24
- TaskTypeEnum,
25
- )
26
- from nv_ingest_api.internal.schemas.meta.base_model_noext import BaseModelNoExt
27
- from nv_ingest_api.util.converters import datetools
28
-
29
- logger = logging.getLogger(__name__)
30
-
31
-
32
- # Sub schemas
33
- class SourceMetadataSchema(BaseModelNoExt):
34
- """
35
- Schema for the knowledge base file from which content
36
- and metadata is extracted.
37
- """
38
-
39
- source_name: str
40
- source_id: str
41
- source_location: str = ""
42
- source_type: Union[DocumentTypeEnum, str]
43
- collection_id: str = ""
44
- date_created: str = datetime.now().isoformat()
45
- last_modified: str = datetime.now().isoformat()
46
- summary: str = ""
47
- partition_id: int = -1
48
- access_level: Union[AccessLevelEnum, int] = AccessLevelEnum.UNKNOWN
49
-
50
- @field_validator("date_created", "last_modified")
51
- @classmethod
52
- def validate_fields(cls, field_value):
53
- datetools.validate_iso8601(field_value)
54
- return field_value
55
-
56
-
57
- class NearbyObjectsSubSchema(BaseModelNoExt):
58
- """
59
- Schema to hold related extracted object.
60
- """
61
-
62
- content: List[str] = Field(default_factory=list)
63
- bbox: List[tuple] = Field(default_factory=list)
64
- type: List[str] = Field(default_factory=list)
65
-
66
-
67
- class NearbyObjectsSchema(BaseModelNoExt):
68
- """
69
- Schema to hold types of related extracted objects.
70
- """
71
-
72
- text: NearbyObjectsSubSchema = NearbyObjectsSubSchema()
73
- images: NearbyObjectsSubSchema = NearbyObjectsSubSchema()
74
- structured: NearbyObjectsSubSchema = NearbyObjectsSubSchema()
75
-
76
-
77
- class ContentHierarchySchema(BaseModelNoExt):
78
- """
79
- Schema for the extracted content hierarchy.
80
- """
81
-
82
- page_count: int = -1
83
- page: int = -1
84
- block: int = -1
85
- line: int = -1
86
- span: int = -1
87
- nearby_objects: NearbyObjectsSchema = NearbyObjectsSchema()
88
-
89
-
90
- class ContentMetadataSchema(BaseModelNoExt):
91
- """
92
- Data extracted from a source; generally Text or Image.
93
- """
94
-
95
- type: ContentTypeEnum
96
- description: str = ""
97
- page_number: int = -1
98
- hierarchy: ContentHierarchySchema = ContentHierarchySchema()
99
- subtype: Union[ContentTypeEnum, str] = ""
100
-
101
-
102
- class TextMetadataSchema(BaseModelNoExt):
103
- text_type: TextTypeEnum
104
- summary: str = ""
105
- keywords: Union[str, List[str], Dict] = ""
106
- language: LanguageEnum = "en" # default to Unknown? Maybe do some kind of heuristic check
107
- text_location: tuple = (0, 0, 0, 0)
108
- text_location_max_dimensions: tuple = (0, 0, 0, 0)
109
-
110
-
111
- class ImageMetadataSchema(BaseModelNoExt):
112
- image_type: Union[DocumentTypeEnum, str]
113
- structured_image_type: ContentTypeEnum = ContentTypeEnum.NONE
114
- caption: str = ""
115
- text: str = ""
116
- image_location: tuple = (0, 0, 0, 0)
117
- image_location_max_dimensions: tuple = (0, 0)
118
- uploaded_image_url: str = ""
119
- width: int = 0
120
- height: int = 0
121
-
122
- @field_validator("image_type")
123
- def validate_image_type(cls, v):
124
- if not isinstance(v, (DocumentTypeEnum, str)):
125
- raise ValueError("image_type must be a string or DocumentTypeEnum")
126
- return v
127
-
128
- @field_validator("width", "height")
129
- def clamp_non_negative(cls, v, field):
130
- if v < 0:
131
- logger.warning(f"{field.field_name} is negative; clamping to 0. Original value: {v}")
132
- return 0
133
- return v
134
-
135
-
136
- class TableMetadataSchema(BaseModelNoExt):
137
- caption: str = ""
138
- table_format: TableFormatEnum
139
- table_content: str = ""
140
- table_content_format: Union[TableFormatEnum, str] = ""
141
- table_location: tuple = (0, 0, 0, 0)
142
- table_location_max_dimensions: tuple = (0, 0)
143
- uploaded_image_uri: str = ""
144
-
145
-
146
- class ChartMetadataSchema(BaseModelNoExt):
147
- caption: str = ""
148
- table_format: TableFormatEnum
149
- table_content: str = ""
150
- table_content_format: Union[TableFormatEnum, str] = ""
151
- table_location: tuple = (0, 0, 0, 0)
152
- table_location_max_dimensions: tuple = (0, 0)
153
- uploaded_image_uri: str = ""
154
-
155
-
156
- class AudioMetadataSchema(BaseModelNoExt):
157
- audio_transcript: str = ""
158
- audio_type: str = ""
159
-
160
-
161
- # TODO consider deprecating this in favor of info msg...
162
- class ErrorMetadataSchema(BaseModelNoExt):
163
- task: TaskTypeEnum
164
- status: StatusEnum
165
- source_id: str = ""
166
- error_msg: str
167
-
168
-
169
- class InfoMessageMetadataSchema(BaseModelNoExt):
170
- task: TaskTypeEnum
171
- status: StatusEnum
172
- message: str
173
- filter: bool
174
-
175
-
176
- # Main metadata schema
177
- class MetadataSchema(BaseModelNoExt):
178
- content: str = ""
179
- content_url: str = ""
180
- embedding: Optional[List[float]] = None
181
- source_metadata: Optional[SourceMetadataSchema] = None
182
- content_metadata: Optional[ContentMetadataSchema] = None
183
- audio_metadata: Optional[AudioMetadataSchema] = None
184
- text_metadata: Optional[TextMetadataSchema] = None
185
- image_metadata: Optional[ImageMetadataSchema] = None
186
- table_metadata: Optional[TableMetadataSchema] = None
187
- chart_metadata: Optional[ChartMetadataSchema] = None
188
- error_metadata: Optional[ErrorMetadataSchema] = None
189
- info_message_metadata: Optional[InfoMessageMetadataSchema] = None
190
- debug_metadata: Optional[Dict[str, Any]] = None
191
- raise_on_failure: bool = False
192
-
193
- @model_validator(mode="before")
194
- @classmethod
195
- def check_metadata_type(cls, values):
196
- content_type = values.get("content_metadata", {}).get("type", None)
197
- if content_type != ContentTypeEnum.AUDIO:
198
- values["audio_metadata"] = None
199
- if content_type != ContentTypeEnum.IMAGE:
200
- values["image_metadata"] = None
201
- if content_type != ContentTypeEnum.TEXT:
202
- values["text_metadata"] = None
203
- if content_type != ContentTypeEnum.STRUCTURED:
204
- values["table_metadata"] = None
205
- return values
206
-
207
-
208
- def validate_metadata(metadata: Dict[str, Any]) -> MetadataSchema:
209
- """
210
- Validates the given metadata dictionary against the MetadataSchema.
211
-
212
- Parameters:
213
- - metadata: A dictionary representing metadata to be validated.
214
-
215
- Returns:
216
- - An instance of MetadataSchema if validation is successful.
217
-
218
- Raises:
219
- - ValidationError: If the metadata does not conform to the schema.
220
- """
221
- return MetadataSchema(**metadata)
@@ -1,3 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0