unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (125)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +57 -13
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -23
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/connector.py +5 -7
  34. unstructured_ingest/v2/interfaces/downloader.py +8 -5
  35. unstructured_ingest/v2/interfaces/file_data.py +8 -2
  36. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  37. unstructured_ingest/v2/interfaces/processor.py +10 -10
  38. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  39. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  40. unstructured_ingest/v2/pipeline/pipeline.py +9 -6
  41. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  42. unstructured_ingest/v2/pipeline/steps/download.py +13 -11
  43. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  44. unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
  45. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  46. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  47. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  48. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  49. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  50. unstructured_ingest/v2/processes/__init__.py +18 -0
  51. unstructured_ingest/v2/processes/chunker.py +74 -28
  52. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  53. unstructured_ingest/v2/processes/connectors/__init__.py +18 -3
  54. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +46 -39
  55. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
  56. unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
  57. unstructured_ingest/v2/processes/connectors/couchbase.py +333 -0
  58. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
  59. unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
  60. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
  61. unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
  62. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
  63. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
  64. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
  66. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
  67. unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
  68. unstructured_ingest/v2/processes/connectors/kdbai.py +170 -0
  69. unstructured_ingest/v2/processes/connectors/local.py +27 -16
  70. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  71. unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
  72. unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
  73. unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
  74. unstructured_ingest/v2/processes/connectors/pinecone.py +22 -21
  75. unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
  76. unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
  77. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  78. unstructured_ingest/v2/processes/connectors/sql.py +29 -31
  79. unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
  80. unstructured_ingest/v2/processes/embedder.py +106 -47
  81. unstructured_ingest/v2/processes/filter.py +11 -5
  82. unstructured_ingest/v2/processes/partitioner.py +79 -33
  83. unstructured_ingest/v2/processes/uncompress.py +3 -3
  84. unstructured_ingest/v2/utils.py +45 -0
  85. unstructured_ingest-0.0.5.dist-info/LICENSE.md +201 -0
  86. unstructured_ingest-0.0.5.dist-info/METADATA +574 -0
  87. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/RECORD +91 -116
  88. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/WHEEL +1 -1
  89. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  90. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  91. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  92. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  93. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  94. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  95. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  96. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  97. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  98. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
  99. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  100. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  101. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  102. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  103. unstructured_ingest/v2/cli/cmds/local.py +0 -52
  104. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  105. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  106. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  107. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  108. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  109. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  110. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  111. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  112. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  113. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  114. unstructured_ingest/v2/cli/configs/__init__.py +0 -13
  115. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  116. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  117. unstructured_ingest/v2/cli/configs/filter.py +0 -28
  118. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  119. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  120. unstructured_ingest/v2/cli/interfaces.py +0 -27
  121. unstructured_ingest/v2/pipeline/utils.py +0 -15
  122. unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
  123. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  124. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/entry_points.txt +0 -0
  125. {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/top_level.txt +0 -0

unstructured_ingest/v2/cli/{utils.py → utils/click.py}
@@ -1,16 +1,12 @@
 import json
 import os.path
-import sys
-from dataclasses import fields, is_dataclass
 from gettext import gettext, ngettext
 from gettext import gettext as _
 from pathlib import Path
-from typing import Any, ForwardRef, Optional, Type, TypeVar, Union, get_args, get_origin
+from typing import Any, Optional, Type, TypeVar

 import click
-
-from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
-from unstructured_ingest.v2.logger import logger
+from pydantic import BaseModel, ConfigDict, Secret


 def conform_click_options(options: dict):
@@ -30,7 +26,13 @@ class Dict(click.ParamType):
         ctx: Optional[click.Context] = None,
     ) -> Any:
         try:
-            return json.loads(value)
+            if isinstance(value, dict):
+                return value
+            if isinstance(value, Path) and value.is_file():
+                with value.open() as f:
+                    return json.load(f)
+            if isinstance(value, str):
+                return json.loads(value)
         except json.JSONDecodeError:
             self.fail(
                 gettext(
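
Still in utils/click.py: the reworked convert() means a Dict-typed option now accepts a JSON string, a pathlib.Path pointing at a JSON file, or an already-parsed dict, instead of only a JSON string. A minimal sketch of the behavior (the command and option below are hypothetical; only the Dict param type comes from the package):

import click

from unstructured_ingest.v2.cli.utils.click import Dict

@click.command()
@click.option("--metadata", type=Dict(), default=None)
def show(metadata):
    # A JSON string on the command line is parsed via json.loads:
    #   show --metadata '{"lang": "eng"}'
    # A value that is already a dict (e.g. a programmatic default) passes
    # through unchanged, and a pathlib.Path pointing at a JSON file is
    # opened and loaded with json.load.
    click.echo(metadata)
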
@@ -107,86 +109,33 @@ class DelimitedString(click.ParamType):
         return split


-EnhancedDataClassJsonMixinT = TypeVar(
-    "EnhancedDataClassJsonMixinT", bound=EnhancedDataClassJsonMixin
-)
-
-
-def extract_config(
-    flat_data: dict, config: Type[EnhancedDataClassJsonMixinT]
-) -> EnhancedDataClassJsonMixinT:
-    """
-    To be able to extract a nested dataclass from a flat dictionary (as in one coming
-    from a click-based options input), the config class is dynamically looked through for
-    nested dataclass fields and new nested dictionaries are created to conform to the
-    shape the overall class expects when parsing from a dict. During the process, this will create
-    copies of the original dictionary to avoid pruning fields but this isn't a
-    problem since the `from_dict()` method ignores unneeded values.
-
-    Not handling more complex edge cases for now such as nested types i.e Union[List[List[...]]]
-    """
-
-    def conform_dict(inner_d: dict, inner_config: Type[EnhancedDataClassJsonMixinT]):
-        # Catch edge cases (i.e. Dict[str, ...]) where underlying type is not a concrete Class,
-        # causing 'issubclass() arg 1 must be a class' errors, return False
-        def is_subclass(instance, class_type) -> bool:
-            try:
-                return issubclass(instance, class_type)
-            except Exception:
-                return False
-
-        dd = inner_d.copy()
-        for field in fields(inner_config):
-            f_type = field.type
-            # typing can be defined using a string, in which case it needs to be resolved
-            # to the actual type. following logic is cherry picked from the typing
-            # get_type_hints() since type resolution can be expensive, only do it
-            # when the type is a string
-            if isinstance(f_type, str):
-                try:
-                    base_globals = sys.modules[inner_config.__module__].__dict__
-                    for_ref = ForwardRef(f_type, is_argument=False, is_class=True)
-                    f_type = for_ref._evaluate(
-                        globalns=base_globals, localns=None, recursive_guard=frozenset()
-                    )
-                except NameError as e:
-                    logger.warning(f"couldn't resolve type {f_type}: {e}")
-            # Handle the case where the type of a value if a Union (possibly optional)
-            if get_origin(f_type) is Union:
-                union_values = get_args(f_type)
-                # handle List types
-                union_values = [
-                    get_args(u)[0] if get_origin(u) is list else u for u in union_values
-                ]
-                # Ignore injected NoneType when optional
-                concrete_union_values = [v for v in union_values if not is_subclass(v, type(None))]
-                dataclass_union_values = [v for v in concrete_union_values if is_dataclass(v)]
-                non_dataclass_union_values = [
-                    v for v in concrete_union_values if not is_dataclass(v)
-                ]
-                if not dataclass_union_values:
-                    continue
-                # Check if the key for this field already exists in the dictionary,
-                # if so it might map to one of these non dataclass fields and this
-                # can't be enforced
-                if non_dataclass_union_values and field.name in dd:
-                    continue
-                if len(dataclass_union_values) > 1:
-                    logger.warning(
-                        "more than one dataclass type possible for field {}, "
-                        "not extracting: {}".format(field.name, ", ".join(dataclass_union_values))
-                    )
-                    continue
-                f_type = dataclass_union_values[0]
-            origin = get_origin(f_type)
-            if origin:
-                f_type = origin
-            if is_subclass(f_type, EnhancedDataClassJsonMixin):
-                dd[field.name] = conform_dict(inner_d=dd, inner_config=f_type)
-        return dd
-
-    adjusted_dict = conform_dict(inner_d=flat_data, inner_config=config)
-    return config.from_dict(adjusted_dict, apply_name_overload=False)
+BaseModelT = TypeVar("BaseModelT", bound=BaseModel)
+
+
+def extract_config(flat_data: dict, config: Type[BaseModelT]) -> BaseModelT:
+    fields = config.model_fields
+    config.model_config = ConfigDict(extra="ignore")
+    field_names = [v.alias or k for k, v in fields.items()]
+    data = {k: v for k, v in flat_data.items() if k in field_names and v is not None}
+    if access_config := fields.get("access_config"):
+        access_config_type = access_config.annotation
+        # Check if raw type is wrapped by a secret
+        if (
+            hasattr(access_config_type, "__origin__")
+            and hasattr(access_config_type, "__args__")
+            and access_config_type.__origin__ is Secret
+        ):
+            ac_subtypes = access_config_type.__args__
+            ac_fields = ac_subtypes[0].model_fields
+        elif issubclass(access_config_type, BaseModel):
+            ac_fields = access_config_type.model_fields
+        else:
+            raise TypeError(f"Unrecognized access_config type: {access_config_type}")
+        ac_field_names = [v.alias or k for k, v in ac_fields.items()]
+        data["access_config"] = {
+            k: v for k, v in flat_data.items() if k in ac_field_names and v is not None
+        }
+    return config.model_validate(obj=data)


 class Group(click.Group):
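
The pydantic rewrite of extract_config trades the recursive dataclass walk for a much flatter contract: keep only the flat keys that match the model's (aliased) fields, drop None values, and regroup the access_config fields into their own nested dict whether or not the annotation is wrapped in Secret. A sketch with hypothetical models:

from pydantic import BaseModel, Secret

from unstructured_ingest.v2.cli.utils.click import extract_config

class MyAccessConfig(BaseModel):  # hypothetical
    api_key: str

class MyConnectionConfig(BaseModel):  # hypothetical
    host: str
    access_config: Secret[MyAccessConfig]

# Flat click options: unrelated keys are ignored, None values dropped,
# and access_config fields are gathered into their own nested dict.
flat = {"host": "localhost", "api_key": "s3cret", "verbose": True, "port": None}
config = extract_config(flat_data=flat, config=MyConnectionConfig)
assert config.host == "localhost"
assert config.access_config.get_secret_value().api_key == "s3cret"
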
@@ -195,13 +144,11 @@ class Group(click.Group):
         This allows for subcommands to be called with the --help flag without breaking
         if parent command is missing any of its required parameters
         """
-
         try:
             return super().parse_args(ctx, args)
         except click.MissingParameter:
             if "--help" not in args:
                 raise
-
             # remove the required params so that help can display
             for param in self.params:
                 param.required = False

unstructured_ingest/v2/cli/utils/model_conversion.py (new file)
@@ -0,0 +1,199 @@
+import contextlib
+import datetime
+from collections import Counter
+from enum import EnumMeta
+from pathlib import Path
+from typing import Any, Callable, Literal, Optional, Type, TypedDict, Union, get_args, get_origin
+from uuid import UUID
+
+import click
+from annotated_types import Ge, Gt, Le, Lt, SupportsGe, SupportsGt, SupportsLe, SupportsLt
+from click import Option
+from pydantic import BaseModel, Secret, SecretStr
+from pydantic.fields import FieldInfo
+from pydantic.types import _SecretBase
+from pydantic_core import PydanticUndefined
+
+from unstructured_ingest.v2.cli.utils.click import DelimitedString, Dict
+
+NoneType = type(None)
+
+
+class _RangeDict(TypedDict, total=False):
+    """Represent arguments to `click.IntRange` or `click.FloatRange`."""
+
+    max: Union[SupportsLt, SupportsLe]
+    min: Union[SupportsGt, SupportsGe]
+    max_open: bool
+    min_open: bool
+
+
+def get_range_from_metadata(metadata: list[Any]) -> _RangeDict:
+    range_args: _RangeDict = {}
+    for constraint in metadata:
+        if isinstance(constraint, Le):
+            range_args["max"] = constraint.le
+            range_args["max_open"] = False
+        if isinstance(constraint, Lt):
+            range_args["max"] = constraint.lt
+            range_args["max_open"] = True
+        if isinstance(constraint, Ge):
+            range_args["min"] = constraint.ge
+            range_args["min_open"] = False
+        if isinstance(constraint, Gt):
+            range_args["min"] = constraint.gt
+            range_args["min_open"] = True
+    return range_args
+
+
+def is_boolean_flag(field_info: FieldInfo) -> bool:
+    annotation = field_info.annotation
+    raw_annotation = get_raw_type(annotation)
+    return raw_annotation is bool
+
+
+def get_raw_type(val: Any) -> Any:
+    field_args = get_args(val)
+    field_origin = get_origin(val)
+    if field_origin is Union and len(field_args) == 2 and NoneType in field_args:
+        field_type = next(field_arg for field_arg in field_args if field_arg is not None)
+        return field_type
+    if field_origin is Secret and len(field_args) == 1:
+        field_type = next(field_arg for field_arg in field_args if field_arg is not None)
+        return field_type
+    if val is SecretStr:
+        return str
+    return val
+
+
+def get_default_value_from_field(field: FieldInfo) -> Optional[Union[Any, Callable[[], Any]]]:
+    if field.default is not PydanticUndefined:
+        return field.default
+    elif field.default_factory is not None:
+        return field.default_factory
+    return None
+
+
+def get_option_name(field_name: str, field_info: FieldInfo) -> str:
+    field_name = field_info.alias or field_name
+    if field_name.startswith("--"):
+        field_name = field_name[2:]
+    field_name = field_name.lower().replace("_", "-")
+    if is_boolean_flag(field_info):
+        return f"--{field_name}/--no-{field_name}"
+    return f"--{field_name}"
+
+
+def get_numerical_type(field: FieldInfo) -> click.ParamType:
+    range_args = get_range_from_metadata(field.metadata)
+    if field.annotation is int:
+        if range_args:
+            return click.IntRange(**range_args)  # type: ignore[arg-type]
+        return click.INT
+    # Non-integer numerical types default to float
+    if range_args:
+        return click.FloatRange(**range_args)  # type: ignore[arg-type]
+    return click.FLOAT
+
+
+def get_type_from_annotation(field_type: Any) -> click.ParamType:
+    field_origin = get_origin(field_type)
+    field_args = get_args(field_type)
+    if field_origin is Union and len(field_args) == 2 and NoneType in field_args:
+        field_type = next(field_arg for field_arg in field_args if field_arg is not None)
+        return get_type_from_annotation(field_type=field_type)
+    if field_origin is Secret and len(field_args) == 1:
+        field_type = next(field_arg for field_arg in field_args if field_arg is not None)
+        return get_type_from_annotation(field_type=field_type)
+    if field_origin is list and len(field_args) == 1 and field_args[0] is str:
+        return DelimitedString()
+    if field_type is SecretStr:
+        return click.STRING
+    if dict in [field_type, field_origin]:
+        return Dict()
+    if field_type is str:
+        return click.STRING
+    if field_type is bool:
+        return click.BOOL
+    if field_type is UUID:
+        return click.UUID
+    if field_type is Path:
+        return click.Path(path_type=Path)
+    if field_type in (datetime.datetime, datetime.date):
+        return click.DateTime()
+    if field_origin is Literal:
+        return click.Choice(field_args)
+    if isinstance(field_type, EnumMeta):
+        values = [i.value for i in field_type]
+        return click.Choice(values)
+    raise TypeError(f"Unexpected field type: {field_type}")
+
+
+def _get_type_from_field(field: FieldInfo) -> click.ParamType:
+    raw_field_type = get_raw_type(field.annotation)
+
+    if raw_field_type in (int, float):
+        return get_numerical_type(field)
+    return get_type_from_annotation(field_type=field.annotation)
+
+
+def get_option_from_field(option_name: str, field_info: FieldInfo) -> Option:
+    param_decls = [option_name]
+    help = field_info.description or ""
+    if examples := field_info.examples:
+        help += f" [Examples: {', '.join(examples)}]"
+    option_kwargs = {
+        "type": _get_type_from_field(field_info),
+        "default": get_default_value_from_field(field_info),
+        "required": field_info.is_required(),
+        "help": help,
+        "is_flag": is_boolean_flag(field_info),
+        "show_default": field_info.default is not PydanticUndefined,
+    }
+    return click.Option(param_decls=param_decls, **option_kwargs)
+
+
+def is_subclass(x: Any, y: Any) -> bool:
+    with contextlib.suppress(TypeError):
+        return issubclass(x, y)
+
+    return False
+
+
+def post_check(options: list[Option]):
+    option_names = [option.name for option in options]
+    duplicate_names = [name for name, count in Counter(option_names).items() if count > 1]
+    if duplicate_names:
+        raise ValueError(
+            "the following field name were reused, all must be unique: {}".format(
+                ", ".join(duplicate_names)
+            )
+        )
+
+
+def is_secret(value: Any) -> bool:
+    # Case Secret[int]
+    if hasattr(value, "__origin__") and hasattr(value, "__args__"):
+        origin = value.__origin__
+        return is_subclass(origin, _SecretBase)
+    # Case SecretStr
+    return is_subclass(value, _SecretBase)
+
+
+def options_from_base_model(model: Union[BaseModel, Type[BaseModel]]) -> list[Option]:
+    options = []
+    model_fields = model.model_fields
+    for field_name, field_info in model_fields.items():
+        if field_info.init is False:
+            continue
+        option_name = get_option_name(field_name=field_name, field_info=field_info)
+        raw_annotation = get_raw_type(field_info.annotation)
+        if is_subclass(raw_annotation, BaseModel):
+            options.extend(options_from_base_model(model=raw_annotation))
+        else:
+            if is_secret(field_info.annotation):
+                field_info.description = f"[sensitive] {field_info.description}"
+            options.append(get_option_from_field(option_name=option_name, field_info=field_info))
+
+    post_check(options=options)
+    return options
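
options_from_base_model is the generic replacement for the roughly two dozen hand-written CLI command modules deleted in this release (the v2/cli/cmds/*.py entries in the list above): each pydantic field becomes a click.Option, with the annotation mapped to a ParamType, Ge/Gt/Le/Lt metadata mapped to an IntRange/FloatRange, booleans rendered as --flag/--no-flag pairs, and secret-typed fields marked "[sensitive]" in their help text. Roughly (the model here is hypothetical):

from pydantic import BaseModel, Field, SecretStr

from unstructured_ingest.v2.cli.utils.model_conversion import options_from_base_model

class IndexerOptions(BaseModel):  # hypothetical
    batch_size: int = Field(default=100, gt=0, description="Files per batch")
    recursive: bool = False
    api_key: SecretStr = SecretStr("")

options = options_from_base_model(IndexerOptions)
# -> --batch-size as click.IntRange(min=0, min_open=True),
#    --recursive/--no-recursive as a boolean flag,
#    --api-key as STRING with help prefixed by "[sensitive]"
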

unstructured_ingest/v2/interfaces/connector.py
@@ -2,11 +2,10 @@ from abc import ABC
 from dataclasses import dataclass
 from typing import Any, TypeVar

-from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
+from pydantic import BaseModel, Secret


-@dataclass
-class AccessConfig(EnhancedDataClassJsonMixin):
+class AccessConfig(BaseModel):
     """Meant to designate holding any sensitive information associated with other configs
     and also for access specific configs."""

@@ -14,14 +13,13 @@ class AccessConfig(EnhancedDataClassJsonMixin):
 AccessConfigT = TypeVar("AccessConfigT", bound=AccessConfig)


-@dataclass
-class ConnectionConfig(EnhancedDataClassJsonMixin):
-    access_config: AccessConfigT
+class ConnectionConfig(BaseModel):
+    access_config: Secret[AccessConfigT]

     def get_access_config(self) -> dict[str, Any]:
         if not self.access_config:
             return {}
-        return self.access_config.to_dict(apply_name_overload=False)
+        return self.access_config.get_secret_value().dict()


 ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)
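
Wrapping access_config in pydantic's Secret is the key change in this interface: credentials are masked in repr() and model dumps and only surface through an explicit get_secret_value() call, which get_access_config() now performs. A sketch with hypothetical subclasses:

from pydantic import Secret

from unstructured_ingest.v2.interfaces.connector import AccessConfig, ConnectionConfig

class MyAccessConfig(AccessConfig):  # hypothetical
    api_token: str

class MyConnectionConfig(ConnectionConfig):  # hypothetical
    access_config: Secret[MyAccessConfig]
    host: str

conn = MyConnectionConfig(
    host="example.com",
    access_config=MyAccessConfig(api_token="s3cret"),
)
print(conn)                      # access_config is masked, e.g. '**********'
print(conn.get_access_config())  # {'api_token': 's3cret'}
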

unstructured_ingest/v2/interfaces/downloader.py
@@ -1,18 +1,21 @@
 import os
 from abc import ABC, abstractmethod
-from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Optional, TypedDict, TypeVar, Union

-from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
+from pydantic import BaseModel, Field
+
 from unstructured_ingest.v2.interfaces.connector import BaseConnector
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.process import BaseProcess


-@dataclass
-class DownloaderConfig(EnhancedDataClassJsonMixin):
-    download_dir: Optional[Path] = None
+class DownloaderConfig(BaseModel):
+    download_dir: Optional[Path] = Field(
+        default=None,
+        description="Where files are downloaded to, defaults to a location at"
+        "`$HOME/.cache/unstructured/ingest/<connector name>/<SHA256>`.",
+    )


 DownloaderConfigT = TypeVar("DownloaderConfigT", bound=DownloaderConfig)

unstructured_ingest/v2/interfaces/file_data.py
@@ -4,7 +4,6 @@ from pathlib import Path
 from typing import Any, Literal, Optional

 from dataclasses_json import DataClassJsonMixin
-from unstructured.documents.elements import DataSourceMetadata


 @dataclass
@@ -23,7 +22,14 @@ class SourceIdentifiers:


 @dataclass
-class FileDataSourceMetadata(DataSourceMetadata):
+class FileDataSourceMetadata(DataClassJsonMixin):
+    url: Optional[str] = None
+    version: Optional[str] = None
+    record_locator: Optional[dict[str, Any]] = None
+    date_created: Optional[str] = None
+    date_modified: Optional[str] = None
+    date_processed: Optional[str] = None
+    permissions_data: Optional[list[dict[str, Any]]] = None
     filesize_bytes: Optional[int] = None

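FileDataSourceMetadata now declares the metadata fields itself rather than inheriting them from unstructured's DataSourceMetadata, removing the unstructured import from the v2 interfaces. Serialization still comes from dataclasses_json, for example:

from unstructured_ingest.v2.interfaces.file_data import FileDataSourceMetadata

meta = FileDataSourceMetadata(
    url="s3://bucket/key",
    date_created="2024-05-01",
    filesize_bytes=1024,
)
# DataClassJsonMixin keeps the dict/JSON round-trip behavior:
assert meta.to_dict()["filesize_bytes"] == 1024
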
unstructured_ingest/v2/interfaces/indexer.py
@@ -1,15 +1,14 @@
 from abc import ABC, abstractmethod
-from dataclasses import dataclass
 from typing import Any, Generator, Optional, TypeVar

-from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
+from pydantic import BaseModel
+
 from unstructured_ingest.v2.interfaces.connector import BaseConnector
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.process import BaseProcess


-@dataclass
-class IndexerConfig(EnhancedDataClassJsonMixin):
+class IndexerConfig(BaseModel):
     pass


unstructured_ingest/v2/interfaces/processor.py
@@ -1,24 +1,24 @@
 import os
 from asyncio import Semaphore
-from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Optional
+from typing import Any, Optional

-from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
+from pydantic import BaseModel, ConfigDict, Field

 DEFAULT_WORK_DIR = str((Path.home() / ".cache" / "unstructured" / "ingest" / "pipeline").resolve())


-@dataclass
-class ProcessorConfig(EnhancedDataClassJsonMixin):
+class ProcessorConfig(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
     reprocess: bool = False
     verbose: bool = False
     tqdm: bool = False
-    work_dir: str = field(default_factory=lambda: DEFAULT_WORK_DIR)
+    work_dir: str = Field(default_factory=lambda: DEFAULT_WORK_DIR)
     num_processes: int = 2
     max_connections: Optional[int] = None
     raise_on_error: bool = False
-    disable_parallelism: bool = field(
+    disable_parallelism: bool = Field(
         default_factory=lambda: os.getenv("INGEST_DISABLE_PARALLELISM", "false").lower() == "true"
     )
     preserve_downloads: bool = False
@@ -28,10 +28,10 @@ class ProcessorConfig(EnhancedDataClassJsonMixin):
     uncompress: bool = False

     # Used to keep track of state in pipeline
-    status: dict = field(default_factory=dict)
-    semaphore: Optional[Semaphore] = field(init=False, default=None)
+    status: dict = Field(default_factory=dict)
+    semaphore: Optional[Semaphore] = Field(init=False, default=None)

-    def __post_init__(self):
+    def model_post_init(self, __context: Any) -> None:
         if self.max_connections is not None:
             self.semaphore = Semaphore(self.max_connections)

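model_post_init is pydantic v2's post-validation hook and the direct stand-in for the dataclass __post_init__; here it builds the asyncio Semaphore whenever max_connections is set (the Semaphore annotation is also why arbitrary_types_allowed is enabled above). Assuming the module path shown in the file list, the behavior looks like:

from asyncio import Semaphore

from unstructured_ingest.v2.interfaces.processor import ProcessorConfig

config = ProcessorConfig(max_connections=5)
# model_post_init ran as part of validation and built the semaphore:
assert isinstance(config.semaphore, Semaphore)

# Without max_connections the semaphore stays unset:
assert ProcessorConfig().semaphore is None
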
unstructured_ingest/v2/interfaces/upload_stager.py
@@ -3,13 +3,13 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, TypeVar

-from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
+from pydantic import BaseModel
+
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.process import BaseProcess


-@dataclass
-class UploadStagerConfig(EnhancedDataClassJsonMixin):
+class UploadStagerConfig(BaseModel):
     pass


unstructured_ingest/v2/interfaces/uploader.py
@@ -3,14 +3,14 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, TypeVar

-from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
+from pydantic import BaseModel
+
 from unstructured_ingest.v2.interfaces.connector import BaseConnector
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.process import BaseProcess


-@dataclass
-class UploaderConfig(EnhancedDataClassJsonMixin):
+class UploaderConfig(BaseModel):
     pass


unstructured_ingest/v2/pipeline/pipeline.py
@@ -15,7 +15,6 @@ from unstructured_ingest.v2.pipeline.steps.partition import Partitioner, Partiti
 from unstructured_ingest.v2.pipeline.steps.stage import UploadStager, UploadStageStep
 from unstructured_ingest.v2.pipeline.steps.uncompress import Uncompressor, UncompressStep
 from unstructured_ingest.v2.pipeline.steps.upload import Uploader, UploadStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.chunker import ChunkerConfig
 from unstructured_ingest.v2.processes.connector_registry import (
     ConnectionConfig,
@@ -178,10 +177,7 @@ class Pipeline:
         return filtered_records

     def _run(self):
-        logger.info(
-            f"Running local pipline: {self} with configs: "
-            f"{sterilize_dict(self.context.to_dict(redact_sensitive=True))}"
-        )
+        logger.info(f"Running local pipline: {self} with configs: " f"{self.context.json()}")
         if self.context.mp_supported:
             manager = mp.Manager()
             self.context.status = manager.dict()
@@ -192,22 +188,26 @@ class Pipeline:
         indices = self.indexer_step.run()
         indices_inputs = [{"file_data_path": i} for i in indices]
         if not indices_inputs:
+            logger.info("No files to process after indexer, exiting")
             return

         # Initial filtering on indexed content
         indices_inputs = self.apply_filter(records=indices_inputs)
         if not indices_inputs:
+            logger.info("No files to process after filtering indexed content, exiting")
             return

         # Download associated content to local file system
         downloaded_data = self.downloader_step(indices_inputs)
         downloaded_data = self.clean_results(results=downloaded_data)
         if not downloaded_data:
+            logger.info("No files to process after downloader, exiting")
             return

         # Post download filtering
         downloaded_data = self.apply_filter(records=downloaded_data)
         if not downloaded_data:
+            logger.info("No files to process after filtering downloaded content, exiting")
             return

         # Run uncompress if available
@@ -219,6 +219,7 @@ class Pipeline:
         # Post uncompress filtering
         downloaded_data = self.apply_filter(records=downloaded_data)
         if not downloaded_data:
+            logger.info("No files to process after filtering uncompressed content, exiting")
             return

         if not downloaded_data:
@@ -228,6 +229,7 @@ class Pipeline:
         elements = self.partitioner_step(downloaded_data)
         elements = self.clean_results(results=elements)
         if not elements:
+            logger.info("No files to process after partitioning, exiting")
             return

         # Run element specific modifiers
@@ -235,6 +237,7 @@ class Pipeline:
             elements = step(elements) if step else elements
             elements = self.clean_results(results=elements)
             if not elements:
+                logger.info(f"No files to process after {step.__class__.__name__}, exiting")
                 return

         # Upload the final result
@@ -337,7 +340,7 @@ class Pipeline:
             )
             if len(destination_entry) != 1:
                 raise ValueError(
-                    "no entry found in source registry with matching uploader, "
+                    "no entry found in destination registry with matching uploader, "
                     "stager and connection configs"
                 )

unstructured_ingest/v2/pipeline/steps/chunk.py
@@ -5,13 +5,11 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict

-from unstructured.staging.base import elements_to_dicts
-
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.chunker import Chunker
+from unstructured_ingest.v2.utils import serialize_base_model_json

 STEP_ID = "chunk"

@@ -30,11 +28,7 @@ class ChunkStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.chunking_strategy})"

     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
-            if self.process.config
-            else None
-        )
+        config = self.process.config.json() if self.process.config else None
         logger.info(f"Created {self.identifier} with configs: {config}")

     def should_chunk(self, filepath: Path, file_data: FileData) -> bool:
@@ -72,13 +66,13 @@ class ChunkStep(PipelineStep):
         chunked_content_raw = await fn(**fn_kwargs)
         self._save_output(
             output_filepath=str(output_filepath),
-            chunked_content=elements_to_dicts(chunked_content_raw),
+            chunked_content=chunked_content_raw,
         )
         return ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath))

     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = json.dumps(
-            self.process.config.to_dict(), sort_keys=True, ensure_ascii=True
+        hashable_string = serialize_base_model_json(
+            model=self.process.config, sort_keys=True, ensure_ascii=True
        )
         if extras:
             hashable_string += "".join(extras)
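
get_hash feeds a canonical JSON dump of the chunker config (plus any extras) into the step's cache key, so identical configs always hash the same; serialize_base_model_json, added in unstructured_ingest/v2/utils.py, provides the stable sorted serialization that json.dumps(config.to_dict(), ...) used to. The idea, as a standalone sketch (names here are illustrative, not the package's implementation):

import hashlib
import json
from typing import Optional

from pydantic import BaseModel

class StepConfig(BaseModel):  # stand-in for any pydantic step config
    chunking_strategy: str = "by_title"
    max_characters: int = 500

def config_hash(config: BaseModel, extras: Optional[list[str]] = None) -> str:
    # Sorted keys and ASCII escaping make the dump deterministic.
    hashable = json.dumps(config.model_dump(), sort_keys=True, ensure_ascii=True)
    if extras:
        hashable += "".join(extras)
    return hashlib.sha256(hashable.encode()).hexdigest()[:12]

assert config_hash(StepConfig()) == config_hash(StepConfig())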