nv-ingest-client 2025.8.14.dev20250814__py3-none-any.whl → 2025.8.16.dev20250816__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-client might be problematic.

Files changed (28)
  1. nv_ingest_client/cli/util/click.py +182 -30
  2. nv_ingest_client/client/interface.py +209 -26
  3. nv_ingest_client/nv_ingest_cli.py +16 -0
  4. nv_ingest_client/primitives/jobs/job_spec.py +29 -9
  5. nv_ingest_client/primitives/tasks/__init__.py +6 -4
  6. nv_ingest_client/primitives/tasks/audio_extraction.py +27 -23
  7. nv_ingest_client/primitives/tasks/caption.py +10 -16
  8. nv_ingest_client/primitives/tasks/chart_extraction.py +16 -10
  9. nv_ingest_client/primitives/tasks/dedup.py +12 -21
  10. nv_ingest_client/primitives/tasks/embed.py +21 -76
  11. nv_ingest_client/primitives/tasks/extract.py +67 -168
  12. nv_ingest_client/primitives/tasks/filter.py +21 -27
  13. nv_ingest_client/primitives/tasks/infographic_extraction.py +16 -13
  14. nv_ingest_client/primitives/tasks/split.py +17 -18
  15. nv_ingest_client/primitives/tasks/store.py +29 -29
  16. nv_ingest_client/primitives/tasks/task_base.py +1 -72
  17. nv_ingest_client/primitives/tasks/task_factory.py +2 -0
  18. nv_ingest_client/primitives/tasks/udf.py +352 -0
  19. nv_ingest_client/util/vdb/milvus.py +1 -0
  20. {nv_ingest_client-2025.8.14.dev20250814.dist-info → nv_ingest_client-2025.8.16.dev20250816.dist-info}/METADATA +1 -1
  21. {nv_ingest_client-2025.8.14.dev20250814.dist-info → nv_ingest_client-2025.8.16.dev20250816.dist-info}/RECORD +25 -27
  22. nv_ingest_client/cli/util/tasks.py +0 -3
  23. nv_ingest_client/primitives/exceptions.py +0 -0
  24. nv_ingest_client/primitives/tasks/transform.py +0 -0
  25. {nv_ingest_client-2025.8.14.dev20250814.dist-info → nv_ingest_client-2025.8.16.dev20250816.dist-info}/WHEEL +0 -0
  26. {nv_ingest_client-2025.8.14.dev20250814.dist-info → nv_ingest_client-2025.8.16.dev20250816.dist-info}/entry_points.txt +0 -0
  27. {nv_ingest_client-2025.8.14.dev20250814.dist-info → nv_ingest_client-2025.8.16.dev20250816.dist-info}/licenses/LICENSE +0 -0
  28. {nv_ingest_client-2025.8.14.dev20250814.dist-info → nv_ingest_client-2025.8.16.dev20250816.dist-info}/top_level.txt +0 -0
@@ -150,23 +150,48 @@ class JobSpec:
150
150
 
151
151
  def add_task(self, task) -> None:
152
152
  """
153
- Adds a task to the job specification.
153
+ Adds a task or list of tasks to the job specification.
154
154
 
155
155
  Parameters
156
156
  ----------
157
- task
158
- The task to add to the job specification. Assumes the task has a to_dict method.
157
+ task : Task or list of Task
158
+ The task(s) to add to the job specification. Can be a single task or a list of tasks.
159
+ Each task must derive from the Task class and have a to_dict method.
159
160
 
160
161
  Raises
161
162
  ------
162
163
  ValueError
163
- If the task does not have a to_dict method.
164
+ If any task does not derive from the Task class.
165
+ """
166
+ # Handle both single tasks and lists of tasks
167
+ if isinstance(task, list):
168
+ # Process each task in the list
169
+ for single_task in task:
170
+ self._add_single_task(single_task)
171
+ else:
172
+ # Process single task
173
+ self._add_single_task(task)
174
+
175
+ def _add_single_task(self, task) -> None:
176
+ """
177
+ Adds a single task to the job specification with automatic task expansion.
178
+
179
+ Parameters
180
+ ----------
181
+ task : Task
182
+ The task to add to the job specification.
183
+
184
+ Raises
185
+ ------
186
+ ValueError
187
+ If the task does not derive from the Task class.
164
188
  """
165
189
  if not isinstance(task, Task):
166
190
  raise ValueError("Task must derive from nv_ingest_client.primitives.Task class")
167
191
 
168
192
  self._tasks.append(task)
169
193
 
194
+ # Automatic task expansion for ExtractTask
170
195
  if isinstance(task, ExtractTask) and (task._extract_tables is True):
171
196
  self._tasks.append(TableExtractionTask())
172
197
  if isinstance(task, ExtractTask) and (task._extract_charts is True):
@@ -322,11 +347,6 @@ class BatchJobSpec:
322
347
  document_type : str, optional
323
348
  The document type used to filter job specifications. If not provided, the
324
349
  `document_type` is inferred from the task, or the task is applied to all job specifications.
325
-
326
- Raises
327
- ------
328
- ValueError
329
- If the task does not derive from the `Task` class.
330
350
  """
331
351
  if not isinstance(task, Task):
332
352
  raise ValueError("Task must derive from nv_ingest_client.primitives.Task class")
@@ -18,14 +18,18 @@ from .task_base import Task
18
18
  from .task_base import TaskType
19
19
  from .task_base import is_valid_task_type
20
20
  from .task_factory import task_factory
21
+ from .udf import UDFTask
21
22
 
22
23
  __all__ = [
23
24
  "AudioExtractionTask",
24
25
  "CaptionTask",
25
26
  "ChartExtractionTask",
27
+ "DedupTask",
28
+ "EmbedTask",
26
29
  "ExtractTask",
27
- "is_valid_task_type",
30
+ "FilterTask",
28
31
  "InfographicExtractionTask",
32
+ "is_valid_task_type",
29
33
  "SplitTask",
30
34
  "StoreEmbedTask",
31
35
  "StoreTask",
@@ -33,7 +37,5 @@ __all__ = [
33
37
  "Task",
34
38
  "task_factory",
35
39
  "TaskType",
36
- "DedupTask",
37
- "FilterTask",
38
- "EmbedTask",
40
+ "UDFTask",
39
41
  ]
@@ -10,33 +10,19 @@ import logging
10
10
  from typing import Dict
11
11
  from typing import Optional
12
12
 
13
- from pydantic import BaseModel
14
- from pydantic import ConfigDict
13
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskAudioExtraction
15
14
 
16
15
  from .task_base import Task
17
16
 
18
17
  logger = logging.getLogger(__name__)
19
18
 
20
19
 
21
- class AudioExtractionSchema(BaseModel):
22
- auth_token: Optional[str] = None
23
- grpc_endpoint: Optional[str] = None
24
- http_endpoint: Optional[str] = None
25
- infer_protocol: Optional[str] = None
26
- function_id: Optional[str] = None
27
- use_ssl: Optional[bool] = None
28
- ssl_cert: Optional[str] = None
29
- segment_audio: Optional[bool] = None
30
-
31
- model_config = ConfigDict(extra="forbid")
32
- model_config["protected_namespaces"] = ()
33
-
34
-
35
20
  class AudioExtractionTask(Task):
36
21
  def __init__(
37
22
  self,
38
23
  auth_token: str = None,
39
24
  grpc_endpoint: str = None,
25
+ http_endpoint: str = None,
40
26
  infer_protocol: str = None,
41
27
  function_id: Optional[str] = None,
42
28
  use_ssl: bool = None,
@@ -45,13 +31,26 @@ class AudioExtractionTask(Task):
45
31
  ) -> None:
46
32
  super().__init__()
47
33
 
48
- self._auth_token = auth_token
49
- self._grpc_endpoint = grpc_endpoint
50
- self._infer_protocol = infer_protocol
51
- self._function_id = function_id
52
- self._use_ssl = use_ssl
53
- self._ssl_cert = ssl_cert
54
- self._segment_audio = segment_audio
34
+ # Use the API schema for validation
35
+ validated_data = IngestTaskAudioExtraction(
36
+ auth_token=auth_token,
37
+ grpc_endpoint=grpc_endpoint,
38
+ http_endpoint=http_endpoint,
39
+ infer_protocol=infer_protocol,
40
+ function_id=function_id,
41
+ use_ssl=use_ssl,
42
+ ssl_cert=ssl_cert,
43
+ segment_audio=segment_audio,
44
+ )
45
+
46
+ self._auth_token = validated_data.auth_token
47
+ self._grpc_endpoint = validated_data.grpc_endpoint
48
+ self._http_endpoint = validated_data.http_endpoint
49
+ self._infer_protocol = validated_data.infer_protocol
50
+ self._function_id = validated_data.function_id
51
+ self._use_ssl = validated_data.use_ssl
52
+ self._ssl_cert = validated_data.ssl_cert
53
+ self._segment_audio = validated_data.segment_audio
55
54
 
56
55
  def __str__(self) -> str:
57
56
  """
@@ -64,6 +63,8 @@ class AudioExtractionTask(Task):
64
63
  info += " auth_token: [redacted]\n"
65
64
  if self._grpc_endpoint:
66
65
  info += f" grpc_endpoint: {self._grpc_endpoint}\n"
66
+ if self._http_endpoint:
67
+ info += f" http_endpoint: {self._http_endpoint}\n"
67
68
  if self._infer_protocol:
68
69
  info += f" infer_protocol: {self._infer_protocol}\n"
69
70
  if self._function_id:
@@ -89,6 +90,9 @@ class AudioExtractionTask(Task):
89
90
  if self._grpc_endpoint:
90
91
  task_properties["grpc_endpoint"] = self._grpc_endpoint
91
92
 
93
+ if self._http_endpoint:
94
+ task_properties["http_endpoint"] = self._http_endpoint
95
+
92
96
  if self._infer_protocol:
93
97
  task_properties["infer_protocol"] = self._infer_protocol
94
98
 
@@ -8,25 +8,14 @@
8
8
 
9
9
  import logging
10
10
  from typing import Dict
11
- from typing import Optional
12
11
 
13
- from pydantic import ConfigDict, BaseModel
12
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskCaptionSchema
14
13
 
15
14
  from .task_base import Task
16
15
 
17
16
  logger = logging.getLogger(__name__)
18
17
 
19
18
 
20
- class CaptionTaskSchema(BaseModel):
21
- api_key: Optional[str] = None
22
- endpoint_url: Optional[str] = None
23
- prompt: Optional[str] = None
24
- model_name: Optional[str] = None
25
-
26
- model_config = ConfigDict(extra="forbid")
27
- model_config["protected_namespaces"] = ()
28
-
29
-
30
19
  class CaptionTask(Task):
31
20
  def __init__(
32
21
  self,
@@ -37,10 +26,15 @@ class CaptionTask(Task):
37
26
  ) -> None:
38
27
  super().__init__()
39
28
 
40
- self._api_key = api_key
41
- self._endpoint_url = endpoint_url
42
- self._prompt = prompt
43
- self._model_name = model_name
29
+ # Use the API schema for validation
30
+ validated_data = IngestTaskCaptionSchema(
31
+ api_key=api_key, endpoint_url=endpoint_url, prompt=prompt, model_name=model_name
32
+ )
33
+
34
+ self._api_key = validated_data.api_key
35
+ self._endpoint_url = validated_data.endpoint_url
36
+ self._prompt = validated_data.prompt
37
+ self._model_name = validated_data.model_name
44
38
 
45
39
  def __str__(self) -> str:
46
40
  """
@@ -9,35 +9,41 @@
9
9
  import logging
10
10
  from typing import Dict
11
11
 
12
- from pydantic import BaseModel
12
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskChartExtraction
13
13
 
14
14
  from .task_base import Task
15
15
 
16
16
  logger = logging.getLogger(__name__)
17
17
 
18
18
 
19
- class ChartExtractionSchema(BaseModel):
20
- class Config:
21
- extra = "forbid"
22
-
23
-
24
19
  class ChartExtractionTask(Task):
25
20
  """
26
21
  Object for chart extraction task
27
22
  """
28
23
 
29
- def __init__(self) -> None:
24
+ def __init__(self, params: dict = None) -> None:
30
25
  """
31
- Setup Dedup Task Config
26
+ Setup Chart Extraction Task Config
32
27
  """
33
28
  super().__init__()
34
29
 
30
+ # Handle None params by converting to empty dict for backward compatibility
31
+ if params is None:
32
+ params = {}
33
+
34
+ # Use the API schema for validation
35
+ validated_data = IngestTaskChartExtraction(params=params)
36
+
37
+ self._params = validated_data.params
38
+
35
39
  def __str__(self) -> str:
36
40
  """
37
41
  Returns a string with the object's config and run time state
38
42
  """
39
43
  info = ""
40
- info += "chart extraction task\n"
44
+ info += "Chart Extraction Task:\n"
45
+ if self._params:
46
+ info += f" params: {self._params}\n"
41
47
  return info
42
48
 
43
49
  def to_dict(self) -> Dict:
@@ -46,7 +52,7 @@ class ChartExtractionTask(Task):
46
52
  """
47
53
 
48
54
  task_properties = {
49
- "params": {},
55
+ "params": self._params,
50
56
  }
51
57
 
52
58
  return {"type": "chart_data_extract", "task_properties": task_properties}
@@ -10,29 +10,13 @@ import logging
10
10
  from typing import Dict
11
11
  from typing import Literal
12
12
 
13
- from pydantic import BaseModel, field_validator
14
-
13
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskDedupSchema
15
14
 
16
15
  from .task_base import Task
17
16
 
18
17
  logger = logging.getLogger(__name__)
19
18
 
20
19
 
21
- class DedupTaskSchema(BaseModel):
22
- content_type: str = "image"
23
- filter: bool = False
24
-
25
- @field_validator("content_type")
26
- def content_type_must_be_valid(cls, v):
27
- valid_criteria = ["image"]
28
- if v not in valid_criteria:
29
- raise ValueError(f"content_type must be one of {valid_criteria}")
30
- return v
31
-
32
- class Config:
33
- extra = "forbid"
34
-
35
-
36
20
  class DedupTask(Task):
37
21
  """
38
22
  Object for document dedup task
@@ -49,8 +33,15 @@ class DedupTask(Task):
49
33
  Setup Dedup Task Config
50
34
  """
51
35
  super().__init__()
52
- self._content_type = content_type
53
- self._filter = filter
36
+
37
+ # Use the API schema for validation
38
+ validated_data = IngestTaskDedupSchema(
39
+ content_type=content_type,
40
+ params={"filter": filter},
41
+ )
42
+
43
+ self._content_type = validated_data.content_type
44
+ self._filter = validated_data.params.filter
54
45
 
55
46
  def __str__(self) -> str:
56
47
  """
@@ -58,7 +49,7 @@ class DedupTask(Task):
58
49
  """
59
50
  info = ""
60
51
  info += "Dedup Task:\n"
61
- info += f" content_type: {self._content_type}\n"
52
+ info += f" content_type: {self._content_type.value}\n"
62
53
  info += f" filter: {self._filter}\n"
63
54
  return info
64
55
 
@@ -69,7 +60,7 @@ class DedupTask(Task):
69
60
  dedup_params = {"filter": self._filter}
70
61
 
71
62
  task_properties = {
72
- "content_type": self._content_type,
63
+ "content_type": self._content_type.value,
73
64
  "params": dedup_params,
74
65
  }
75
66
 
@@ -7,82 +7,15 @@
7
7
  import logging
8
8
  from typing import Any
9
9
  from typing import Dict
10
- from typing import Literal
11
10
  from typing import Optional
12
- from typing import Type
13
11
 
14
- from pydantic import BaseModel
15
- from pydantic import ConfigDict
16
- from pydantic import model_validator
12
+ from nv_ingest_api.internal.schemas.meta.ingest_job_schema import IngestTaskEmbedSchema
17
13
 
18
14
  from .task_base import Task
19
15
 
20
16
  logger = logging.getLogger(__name__)
21
17
 
22
18
 
23
- class EmbedTaskSchema(BaseModel):
24
- """
25
- Schema for embed task configuration.
26
-
27
- This schema contains configuration details for an embedding task,
28
- including the endpoint URL, model name, API key, and error filtering flag.
29
-
30
- Attributes
31
- ----------
32
- endpoint_url : Optional[str]
33
- URL of the embedding endpoint. Default is None.
34
- model_name : Optional[str]
35
- Name of the embedding model. Default is None.
36
- api_key : Optional[str]
37
- API key for authentication with the embedding service. Default is None.
38
- filter_errors : bool
39
- Flag to indicate whether errors should be filtered. Default is False.
40
- """
41
-
42
- endpoint_url: Optional[str] = None
43
- model_name: Optional[str] = None
44
- api_key: Optional[str] = None
45
- filter_errors: bool = False
46
-
47
- text_elements_modality: Optional[Literal["text", "image", "text_image"]] = None
48
- image_elements_modality: Optional[Literal["text", "image", "text_image"]] = None
49
- structured_elements_modality: Optional[Literal["text", "image", "text_image"]] = None
50
- audio_elements_modality: Optional[Literal["text"]] = None
51
-
52
- @model_validator(mode="before")
53
- def handle_deprecated_fields(cls: Type["EmbedTaskSchema"], values: Dict[str, Any]) -> Dict[str, Any]:
54
- """
55
- Handle deprecated fields before model validation.
56
-
57
- This validator checks for the presence of deprecated keys ('text' and 'tables')
58
- in the input dictionary and removes them. Warnings are issued if these keys are found.
59
-
60
- Parameters
61
- ----------
62
- values : Dict[str, Any]
63
- Input dictionary of model values.
64
-
65
- Returns
66
- -------
67
- Dict[str, Any]
68
- The updated dictionary with deprecated fields removed.
69
- """
70
- if "text" in values:
71
- logger.warning(
72
- "'text' parameter is deprecated and will be ignored. Future versions will remove this argument."
73
- )
74
- values.pop("text")
75
- if "tables" in values:
76
- logger.warning(
77
- "'tables' parameter is deprecated and will be ignored. Future versions will remove this argument."
78
- )
79
- values.pop("tables")
80
- return values
81
-
82
- model_config = ConfigDict(extra="forbid")
83
- model_config["protected_namespaces"] = ()
84
-
85
-
86
19
  class EmbedTask(Task):
87
20
  """
88
21
  Object for document embedding tasks.
@@ -133,14 +66,26 @@ class EmbedTask(Task):
133
66
  "'tables' parameter is deprecated and will be ignored. Future versions will remove this argument."
134
67
  )
135
68
 
136
- self._endpoint_url: Optional[str] = endpoint_url
137
- self._model_name: Optional[str] = model_name
138
- self._api_key: Optional[str] = api_key
139
- self._filter_errors: bool = filter_errors
140
- self._text_elements_modality: Optional[bool] = text_elements_modality
141
- self._image_elements_modality: Optional[bool] = image_elements_modality
142
- self._structured_elements_modality: Optional[bool] = structured_elements_modality
143
- self._audio_elements_modality: Optional[bool] = audio_elements_modality
69
+ # Use the API schema for validation
70
+ validated_data = IngestTaskEmbedSchema(
71
+ endpoint_url=endpoint_url,
72
+ model_name=model_name,
73
+ api_key=api_key,
74
+ filter_errors=filter_errors,
75
+ text_elements_modality=text_elements_modality,
76
+ image_elements_modality=image_elements_modality,
77
+ structured_elements_modality=structured_elements_modality,
78
+ audio_elements_modality=audio_elements_modality,
79
+ )
80
+
81
+ self._endpoint_url = validated_data.endpoint_url
82
+ self._model_name = validated_data.model_name
83
+ self._api_key = validated_data.api_key
84
+ self._filter_errors = validated_data.filter_errors
85
+ self._text_elements_modality = validated_data.text_elements_modality
86
+ self._image_elements_modality = validated_data.image_elements_modality
87
+ self._structured_elements_modality = validated_data.structured_elements_modality
88
+ self._audio_elements_modality = validated_data.audio_elements_modality
144
89
 
145
90
  def __str__(self) -> str:
146
91
  """