unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (125)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/cli.py +6 -1
  3. unstructured_ingest/cli/cmds/__init__.py +4 -4
  4. unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
  5. unstructured_ingest/cli/interfaces.py +13 -6
  6. unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
  7. unstructured_ingest/connector/biomed.py +12 -5
  8. unstructured_ingest/connector/confluence.py +3 -3
  9. unstructured_ingest/connector/github.py +3 -2
  10. unstructured_ingest/connector/google_drive.py +1 -2
  11. unstructured_ingest/connector/mongodb.py +1 -2
  12. unstructured_ingest/connector/notion/client.py +31 -16
  13. unstructured_ingest/connector/notion/connector.py +3 -2
  14. unstructured_ingest/connector/registry.py +2 -2
  15. unstructured_ingest/connector/vectara.py +7 -2
  16. unstructured_ingest/interfaces.py +13 -9
  17. unstructured_ingest/pipeline/interfaces.py +8 -3
  18. unstructured_ingest/pipeline/reformat/chunking.py +13 -9
  19. unstructured_ingest/pipeline/reformat/embedding.py +3 -3
  20. unstructured_ingest/runner/__init__.py +2 -2
  21. unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
  22. unstructured_ingest/runner/writers/__init__.py +2 -2
  23. unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
  24. unstructured_ingest/utils/chunking.py +45 -0
  25. unstructured_ingest/utils/dep_check.py +1 -1
  26. unstructured_ingest/utils/google_filetype.py +9 -0
  27. unstructured_ingest/v2/cli/base/cmd.py +66 -12
  28. unstructured_ingest/v2/cli/base/dest.py +21 -12
  29. unstructured_ingest/v2/cli/base/src.py +35 -21
  30. unstructured_ingest/v2/cli/cmds.py +14 -0
  31. unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
  32. unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
  33. unstructured_ingest/v2/interfaces/__init__.py +2 -1
  34. unstructured_ingest/v2/interfaces/connector.py +5 -7
  35. unstructured_ingest/v2/interfaces/downloader.py +17 -8
  36. unstructured_ingest/v2/interfaces/file_data.py +13 -2
  37. unstructured_ingest/v2/interfaces/indexer.py +3 -4
  38. unstructured_ingest/v2/interfaces/process.py +3 -4
  39. unstructured_ingest/v2/interfaces/processor.py +10 -10
  40. unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
  41. unstructured_ingest/v2/interfaces/uploader.py +3 -3
  42. unstructured_ingest/v2/pipeline/interfaces.py +3 -5
  43. unstructured_ingest/v2/pipeline/pipeline.py +73 -7
  44. unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
  45. unstructured_ingest/v2/pipeline/steps/download.py +90 -24
  46. unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
  47. unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
  48. unstructured_ingest/v2/pipeline/steps/index.py +14 -10
  49. unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
  50. unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
  51. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
  52. unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
  53. unstructured_ingest/v2/processes/__init__.py +18 -0
  54. unstructured_ingest/v2/processes/chunker.py +74 -28
  55. unstructured_ingest/v2/processes/connector_registry.py +8 -2
  56. unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
  57. unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +53 -35
  58. unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +38 -27
  59. unstructured_ingest/v2/processes/connectors/chroma.py +38 -27
  60. unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
  61. unstructured_ingest/v2/processes/connectors/databricks_volumes.py +95 -31
  62. unstructured_ingest/v2/processes/connectors/elasticsearch.py +92 -53
  63. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +47 -16
  64. unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -13
  65. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +18 -11
  66. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +49 -61
  67. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +46 -13
  68. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +50 -20
  69. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +27 -28
  70. unstructured_ingest/v2/processes/connectors/google_drive.py +52 -42
  71. unstructured_ingest/v2/processes/connectors/local.py +36 -28
  72. unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
  73. unstructured_ingest/v2/processes/connectors/mongodb.py +32 -22
  74. unstructured_ingest/v2/processes/connectors/onedrive.py +31 -16
  75. unstructured_ingest/v2/processes/connectors/opensearch.py +81 -43
  76. unstructured_ingest/v2/processes/connectors/pinecone.py +29 -23
  77. unstructured_ingest/v2/processes/connectors/salesforce.py +36 -26
  78. unstructured_ingest/v2/processes/connectors/sharepoint.py +64 -33
  79. unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
  80. unstructured_ingest/v2/processes/connectors/sql.py +52 -39
  81. unstructured_ingest/v2/processes/connectors/weaviate.py +35 -18
  82. unstructured_ingest/v2/processes/embedder.py +106 -47
  83. unstructured_ingest/v2/processes/filter.py +60 -0
  84. unstructured_ingest/v2/processes/partitioner.py +79 -33
  85. unstructured_ingest/v2/processes/uncompress.py +3 -3
  86. unstructured_ingest/v2/utils.py +45 -0
  87. unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
  88. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +92 -116
  89. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
  90. unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
  91. unstructured_ingest/v2/cli/cmds/astra.py +0 -85
  92. unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
  93. unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
  94. unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
  95. unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
  96. unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
  97. unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
  98. unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
  99. unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -77
  100. unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
  101. unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
  102. unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
  103. unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
  104. unstructured_ingest/v2/cli/cmds/local.py +0 -60
  105. unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
  106. unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
  107. unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
  108. unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
  109. unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
  110. unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
  111. unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
  112. unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
  113. unstructured_ingest/v2/cli/cmds/sql.py +0 -84
  114. unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
  115. unstructured_ingest/v2/cli/configs/__init__.py +0 -6
  116. unstructured_ingest/v2/cli/configs/chunk.py +0 -89
  117. unstructured_ingest/v2/cli/configs/embed.py +0 -74
  118. unstructured_ingest/v2/cli/configs/partition.py +0 -99
  119. unstructured_ingest/v2/cli/configs/processor.py +0 -88
  120. unstructured_ingest/v2/cli/interfaces.py +0 -27
  121. unstructured_ingest/v2/pipeline/utils.py +0 -15
  122. unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
  123. /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
  124. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
  125. {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/cli/{utils.py → utils/click.py}
@@ -1,16 +1,12 @@
  import json
  import os.path
- import sys
- from dataclasses import fields, is_dataclass
  from gettext import gettext, ngettext
  from gettext import gettext as _
  from pathlib import Path
- from typing import Any, ForwardRef, Optional, Type, TypeVar, Union, get_args, get_origin
+ from typing import Any, Optional, Type, TypeVar

  import click
-
- from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
- from unstructured_ingest.v2.logger import logger
+ from pydantic import BaseModel, ConfigDict, Secret


  def conform_click_options(options: dict):
@@ -30,7 +26,13 @@ class Dict(click.ParamType):
          ctx: Optional[click.Context] = None,
      ) -> Any:
          try:
-             return json.loads(value)
+             if isinstance(value, dict):
+                 return value
+             if isinstance(value, Path) and value.is_file():
+                 with value.open() as f:
+                     return json.load(f)
+             if isinstance(value, str):
+                 return json.loads(value)
          except json.JSONDecodeError:
              self.fail(
                  gettext(
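
Note: `Dict.convert` now accepts a value that is already a parsed dict (e.g. a click default), a `Path` pointing at a JSON file, or a raw JSON string. A minimal standalone sketch of the same coercion logic (the helper name `coerce_to_dict` is invented for illustration):

import json
from pathlib import Path

def coerce_to_dict(value):
    # Mirrors the branches added to Dict.convert above
    if isinstance(value, dict):  # already parsed, e.g. a default value
        return value
    if isinstance(value, Path) and value.is_file():  # path to a JSON file on disk
        with value.open() as f:
            return json.load(f)
    if isinstance(value, str):  # inline JSON string
        return json.loads(value)
    raise TypeError(f"unsupported value: {value!r}")

assert coerce_to_dict('{"a": 1}') == {"a": 1}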
@@ -107,86 +109,33 @@ class DelimitedString(click.ParamType):
          return split


- EnhancedDataClassJsonMixinT = TypeVar(
-     "EnhancedDataClassJsonMixinT", bound=EnhancedDataClassJsonMixin
- )
-
-
- def extract_config(
-     flat_data: dict, config: Type[EnhancedDataClassJsonMixinT]
- ) -> EnhancedDataClassJsonMixinT:
-     """
-     To be able to extract a nested dataclass from a flat dictionary (as in one coming
-     from a click-based options input), the config class is dynamically looked through for
-     nested dataclass fields and new nested dictionaries are created to conform to the
-     shape the overall class expects when parsing from a dict. During the process, this will create
-     copies of the original dictionary to avoid pruning fields but this isn't a
-     problem since the `from_dict()` method ignores unneeded values.
-
-     Not handling more complex edge cases for now such as nested types i.e Union[List[List[...]]]
-     """
-
-     def conform_dict(inner_d: dict, inner_config: Type[EnhancedDataClassJsonMixinT]):
-         # Catch edge cases (i.e. Dict[str, ...]) where underlying type is not a concrete Class,
-         # causing 'issubclass() arg 1 must be a class' errors, return False
-         def is_subclass(instance, class_type) -> bool:
-             try:
-                 return issubclass(instance, class_type)
-             except Exception:
-                 return False
-
-         dd = inner_d.copy()
-         for field in fields(inner_config):
-             f_type = field.type
-             # typing can be defined using a string, in which case it needs to be resolved
-             # to the actual type. following logic is cherry picked from the typing
-             # get_type_hints() since type resolution can be expensive, only do it
-             # when the type is a string
-             if isinstance(f_type, str):
-                 try:
-                     base_globals = sys.modules[inner_config.__module__].__dict__
-                     for_ref = ForwardRef(f_type, is_argument=False, is_class=True)
-                     f_type = for_ref._evaluate(
-                         globalns=base_globals, localns=None, recursive_guard=frozenset()
-                     )
-                 except NameError as e:
-                     logger.warning(f"couldn't resolve type {f_type}: {e}")
-             # Handle the case where the type of a value if a Union (possibly optional)
-             if get_origin(f_type) is Union:
-                 union_values = get_args(f_type)
-                 # handle List types
-                 union_values = [
-                     get_args(u)[0] if get_origin(u) is list else u for u in union_values
-                 ]
-                 # Ignore injected NoneType when optional
-                 concrete_union_values = [v for v in union_values if not is_subclass(v, type(None))]
-                 dataclass_union_values = [v for v in concrete_union_values if is_dataclass(v)]
-                 non_dataclass_union_values = [
-                     v for v in concrete_union_values if not is_dataclass(v)
-                 ]
-                 if not dataclass_union_values:
-                     continue
-                 # Check if the key for this field already exists in the dictionary,
-                 # if so it might map to one of these non dataclass fields and this
-                 # can't be enforced
-                 if non_dataclass_union_values and field.name in dd:
-                     continue
-                 if len(dataclass_union_values) > 1:
-                     logger.warning(
-                         "more than one dataclass type possible for field {}, "
-                         "not extracting: {}".format(field.name, ", ".join(dataclass_union_values))
-                     )
-                     continue
-                 f_type = dataclass_union_values[0]
-             origin = get_origin(f_type)
-             if origin:
-                 f_type = origin
-             if is_subclass(f_type, EnhancedDataClassJsonMixin):
-                 dd[field.name] = conform_dict(inner_d=dd, inner_config=f_type)
-         return dd
-
-     adjusted_dict = conform_dict(inner_d=flat_data, inner_config=config)
-     return config.from_dict(adjusted_dict, apply_name_overload=False)
+ BaseModelT = TypeVar("BaseModelT", bound=BaseModel)
+
+
+ def extract_config(flat_data: dict, config: Type[BaseModelT]) -> BaseModelT:
+     fields = config.model_fields
+     config.model_config = ConfigDict(extra="ignore")
+     field_names = [v.alias or k for k, v in fields.items()]
+     data = {k: v for k, v in flat_data.items() if k in field_names and v is not None}
+     if access_config := fields.get("access_config"):
+         access_config_type = access_config.annotation
+         # Check if raw type is wrapped by a secret
+         if (
+             hasattr(access_config_type, "__origin__")
+             and hasattr(access_config_type, "__args__")
+             and access_config_type.__origin__ is Secret
+         ):
+             ac_subtypes = access_config_type.__args__
+             ac_fields = ac_subtypes[0].model_fields
+         elif issubclass(access_config_type, BaseModel):
+             ac_fields = access_config_type.model_fields
+         else:
+             raise TypeError(f"Unrecognized access_config type: {access_config_type}")
+         ac_field_names = [v.alias or k for k, v in ac_fields.items()]
+         data["access_config"] = {
+             k: v for k, v in flat_data.items() if k in ac_field_names and v is not None
+         }
+     return config.model_validate(obj=data)


  class Group(click.Group):
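
Note: the pydantic rewrite of `extract_config` replaces the recursive dataclass walk with a flat match against `model_fields`, plus special handling for an `access_config` field that may be wrapped in pydantic's `Secret`. A rough usage sketch under that reading (the `My*` models are hypothetical):

from pydantic import BaseModel, Secret

from unstructured_ingest.v2.cli.utils.click import extract_config

class MyAccessConfig(BaseModel):  # hypothetical
    api_key: str

class MyConnectionConfig(BaseModel):  # hypothetical
    host: str
    access_config: Secret[MyAccessConfig]

# Flat mapping as produced by click options; keys matching no field are ignored
flat = {"host": "db.example.com", "api_key": "s3cr3t", "unrelated": 1}
cfg = extract_config(flat_data=flat, config=MyConnectionConfig)
assert cfg.host == "db.example.com"
assert cfg.access_config.get_secret_value().api_key == "s3cr3t"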
@@ -195,13 +144,11 @@ class Group(click.Group):
          This allows for subcommands to be called with the --help flag without breaking
          if parent command is missing any of its required parameters
          """
-
          try:
              return super().parse_args(ctx, args)
          except click.MissingParameter:
              if "--help" not in args:
                  raise
-
          # remove the required params so that help can display
          for param in self.params:
              param.required = False
unstructured_ingest/v2/cli/utils/model_conversion.py (new file)
@@ -0,0 +1,199 @@
+ import contextlib
+ import datetime
+ from collections import Counter
+ from enum import EnumMeta
+ from pathlib import Path
+ from typing import Any, Callable, Literal, Optional, Type, TypedDict, Union, get_args, get_origin
+ from uuid import UUID
+
+ import click
+ from annotated_types import Ge, Gt, Le, Lt, SupportsGe, SupportsGt, SupportsLe, SupportsLt
+ from click import Option
+ from pydantic import BaseModel, Secret, SecretStr
+ from pydantic.fields import FieldInfo
+ from pydantic.types import _SecretBase
+ from pydantic_core import PydanticUndefined
+
+ from unstructured_ingest.v2.cli.utils.click import DelimitedString, Dict
+
+ NoneType = type(None)
+
+
+ class _RangeDict(TypedDict, total=False):
+     """Represent arguments to `click.IntRange` or `click.FloatRange`."""
+
+     max: Union[SupportsLt, SupportsLe]
+     min: Union[SupportsGt, SupportsGe]
+     max_open: bool
+     min_open: bool
+
+
+ def get_range_from_metadata(metadata: list[Any]) -> _RangeDict:
+     range_args: _RangeDict = {}
+     for constraint in metadata:
+         if isinstance(constraint, Le):
+             range_args["max"] = constraint.le
+             range_args["max_open"] = False
+         if isinstance(constraint, Lt):
+             range_args["max"] = constraint.lt
+             range_args["max_open"] = True
+         if isinstance(constraint, Ge):
+             range_args["min"] = constraint.ge
+             range_args["min_open"] = False
+         if isinstance(constraint, Gt):
+             range_args["min"] = constraint.gt
+             range_args["min_open"] = True
+     return range_args
+
+
+ def is_boolean_flag(field_info: FieldInfo) -> bool:
+     annotation = field_info.annotation
+     raw_annotation = get_raw_type(annotation)
+     return raw_annotation is bool
+
+
+ def get_raw_type(val: Any) -> Any:
+     field_args = get_args(val)
+     field_origin = get_origin(val)
+     if field_origin is Union and len(field_args) == 2 and NoneType in field_args:
+         field_type = next(field_arg for field_arg in field_args if field_arg is not None)
+         return field_type
+     if field_origin is Secret and len(field_args) == 1:
+         field_type = next(field_arg for field_arg in field_args if field_arg is not None)
+         return field_type
+     if val is SecretStr:
+         return str
+     return val
+
+
+ def get_default_value_from_field(field: FieldInfo) -> Optional[Union[Any, Callable[[], Any]]]:
+     if field.default is not PydanticUndefined:
+         return field.default
+     elif field.default_factory is not None:
+         return field.default_factory
+     return None
+
+
+ def get_option_name(field_name: str, field_info: FieldInfo) -> str:
+     field_name = field_info.alias or field_name
+     if field_name.startswith("--"):
+         field_name = field_name[2:]
+     field_name = field_name.lower().replace("_", "-")
+     if is_boolean_flag(field_info):
+         return f"--{field_name}/--no-{field_name}"
+     return f"--{field_name}"
+
+
+ def get_numerical_type(field: FieldInfo) -> click.ParamType:
+     range_args = get_range_from_metadata(field.metadata)
+     if field.annotation is int:
+         if range_args:
+             return click.IntRange(**range_args)  # type: ignore[arg-type]
+         return click.INT
+     # Non-integer numerical types default to float
+     if range_args:
+         return click.FloatRange(**range_args)  # type: ignore[arg-type]
+     return click.FLOAT
+
+
+ def get_type_from_annotation(field_type: Any) -> click.ParamType:
+     field_origin = get_origin(field_type)
+     field_args = get_args(field_type)
+     if field_origin is Union and len(field_args) == 2 and NoneType in field_args:
+         field_type = next(field_arg for field_arg in field_args if field_arg is not None)
+         return get_type_from_annotation(field_type=field_type)
+     if field_origin is Secret and len(field_args) == 1:
+         field_type = next(field_arg for field_arg in field_args if field_arg is not None)
+         return get_type_from_annotation(field_type=field_type)
+     if field_origin is list and len(field_args) == 1 and field_args[0] is str:
+         return DelimitedString()
+     if field_type is SecretStr:
+         return click.STRING
+     if dict in [field_type, field_origin]:
+         return Dict()
+     if field_type is str:
+         return click.STRING
+     if field_type is bool:
+         return click.BOOL
+     if field_type is UUID:
+         return click.UUID
+     if field_type is Path:
+         return click.Path(path_type=Path)
+     if field_type in (datetime.datetime, datetime.date):
+         return click.DateTime()
+     if field_origin is Literal:
+         return click.Choice(field_args)
+     if isinstance(field_type, EnumMeta):
+         values = [i.value for i in field_type]
+         return click.Choice(values)
+     raise TypeError(f"Unexpected field type: {field_type}")
+
+
+ def _get_type_from_field(field: FieldInfo) -> click.ParamType:
+     raw_field_type = get_raw_type(field.annotation)
+
+     if raw_field_type in (int, float):
+         return get_numerical_type(field)
+     return get_type_from_annotation(field_type=field.annotation)
+
+
+ def get_option_from_field(option_name: str, field_info: FieldInfo) -> Option:
+     param_decls = [option_name]
+     help = field_info.description or ""
+     if examples := field_info.examples:
+         help += f" [Examples: {', '.join(examples)}]"
+     option_kwargs = {
+         "type": _get_type_from_field(field_info),
+         "default": get_default_value_from_field(field_info),
+         "required": field_info.is_required(),
+         "help": help,
+         "is_flag": is_boolean_flag(field_info),
+         "show_default": field_info.default is not PydanticUndefined,
+     }
+     return click.Option(param_decls=param_decls, **option_kwargs)
+
+
+ def is_subclass(x: Any, y: Any) -> bool:
+     with contextlib.suppress(TypeError):
+         return issubclass(x, y)
+
+     return False
+
+
+ def post_check(options: list[Option]):
+     option_names = [option.name for option in options]
+     duplicate_names = [name for name, count in Counter(option_names).items() if count > 1]
+     if duplicate_names:
+         raise ValueError(
+             "the following field name were reused, all must be unique: {}".format(
+                 ", ".join(duplicate_names)
+             )
+         )
+
+
+ def is_secret(value: Any) -> bool:
+     # Case Secret[int]
+     if hasattr(value, "__origin__") and hasattr(value, "__args__"):
+         origin = value.__origin__
+         return is_subclass(origin, _SecretBase)
+     # Case SecretStr
+     return is_subclass(value, _SecretBase)
+
+
+ def options_from_base_model(model: Union[BaseModel, Type[BaseModel]]) -> list[Option]:
+     options = []
+     model_fields = model.model_fields
+     for field_name, field_info in model_fields.items():
+         if field_info.init is False:
+             continue
+         option_name = get_option_name(field_name=field_name, field_info=field_info)
+         raw_annotation = get_raw_type(field_info.annotation)
+         if is_subclass(raw_annotation, BaseModel):
+             options.extend(options_from_base_model(model=raw_annotation))
+         else:
+             if is_secret(field_info.annotation):
+                 field_info.description = f"[sensitive] {field_info.description}"
+             options.append(get_option_from_field(option_name=option_name, field_info=field_info))
+
+     post_check(options=options)
+     return options
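
Note: together these helpers derive a full set of click options from a pydantic model: field names become `--option-name`, annotations map to click `ParamType`s, numeric constraints become ranges, and `Secret`/`SecretStr` fields get a `[sensitive]` label. A rough usage sketch (the `S3Ish` model and its fields are invented for illustration):

import click
from pydantic import BaseModel, Field, SecretStr

from unstructured_ingest.v2.cli.utils.model_conversion import options_from_base_model

class S3Ish(BaseModel):  # hypothetical model
    remote_url: str = Field(description="Remote fsspec URL")
    recursive: bool = Field(default=False, description="Recurse into subdirectories")
    key: SecretStr = Field(description="Access key")

cmd = click.Command("demo", params=options_from_base_model(S3Ish))
# Expect options roughly like: a required --remote-url TEXT, a
# --recursive/--no-recursive flag, and --key TEXT with help "[sensitive] Access key".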
unstructured_ingest/v2/interfaces/__init__.py
@@ -1,6 +1,6 @@
  from .connector import AccessConfig, BaseConnector, ConnectionConfig
  from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
- from .file_data import FileData, SourceIdentifiers
+ from .file_data import FileData, FileDataSourceMetadata, SourceIdentifiers
  from .indexer import Indexer, IndexerConfig
  from .process import BaseProcess
  from .processor import ProcessorConfig
@@ -26,4 +26,5 @@ __all__ = [
      "AccessConfig",
      "ConnectionConfig",
      "BaseConnector",
+     "FileDataSourceMetadata",
  ]
unstructured_ingest/v2/interfaces/connector.py
@@ -2,11 +2,10 @@ from abc import ABC
  from dataclasses import dataclass
  from typing import Any, TypeVar

- from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
+ from pydantic import BaseModel, Secret


- @dataclass
- class AccessConfig(EnhancedDataClassJsonMixin):
+ class AccessConfig(BaseModel):
      """Meant to designate holding any sensitive information associated with other configs
      and also for access specific configs."""

@@ -14,14 +13,13 @@ class AccessConfig(EnhancedDataClassJsonMixin):
  AccessConfigT = TypeVar("AccessConfigT", bound=AccessConfig)


- @dataclass
- class ConnectionConfig(EnhancedDataClassJsonMixin):
-     access_config: AccessConfigT
+ class ConnectionConfig(BaseModel):
+     access_config: Secret[AccessConfigT]

      def get_access_config(self) -> dict[str, Any]:
          if not self.access_config:
              return {}
-         return self.access_config.to_dict(apply_name_overload=False)
+         return self.access_config.get_secret_value().dict()


  ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)
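
Note: wrapping `access_config` in pydantic's `Secret` means serialized configs mask credentials, and callers must unwrap explicitly (as `get_access_config` now does with `get_secret_value()`). A minimal sketch of that behavior (model names hypothetical):

from pydantic import BaseModel, Secret

class DemoAccessConfig(BaseModel):  # hypothetical
    token: str

class DemoConnectionConfig(BaseModel):  # hypothetical, mirrors ConnectionConfig
    host: str
    access_config: Secret[DemoAccessConfig]

cfg = DemoConnectionConfig(
    host="example.com", access_config=DemoAccessConfig(token="t0k3n")
)
print(cfg.model_dump_json())  # access_config serializes masked as "**********"
print(cfg.access_config.get_secret_value().token)  # explicit unwrap: t0k3n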
unstructured_ingest/v2/interfaces/downloader.py
@@ -1,18 +1,21 @@
  import os
  from abc import ABC, abstractmethod
- from dataclasses import dataclass
  from pathlib import Path
  from typing import Any, Optional, TypedDict, TypeVar, Union

- from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
+ from pydantic import BaseModel, Field
+
  from unstructured_ingest.v2.interfaces.connector import BaseConnector
  from unstructured_ingest.v2.interfaces.file_data import FileData
  from unstructured_ingest.v2.interfaces.process import BaseProcess


- @dataclass
- class DownloaderConfig(EnhancedDataClassJsonMixin):
-     download_dir: Optional[Path] = None
+ class DownloaderConfig(BaseModel):
+     download_dir: Optional[Path] = Field(
+         default=None,
+         description="Where files are downloaded to, defaults to a location at"
+         "`$HOME/.cache/unstructured/ingest/<connector name>/<SHA256>`.",
+     )


  DownloaderConfigT = TypeVar("DownloaderConfigT", bound=DownloaderConfig)
@@ -30,6 +33,15 @@ class Downloader(BaseProcess, BaseConnector, ABC):
      connector_type: str
      download_config: DownloaderConfigT

+     def get_download_path(self, file_data: FileData) -> Optional[Path]:
+         if not file_data.source_identifiers:
+             return None
+         rel_path = file_data.source_identifiers.relative_path
+         if not rel_path:
+             return None
+         rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
+         return self.download_dir / Path(rel_path)
+
      @staticmethod
      def is_float(value: str):
          try:
@@ -68,9 +80,6 @@ class Downloader(BaseProcess, BaseConnector, ABC):
      def is_async(self) -> bool:
          return True

-     def get_download_path(self, file_data: FileData) -> Optional[Path]:
-         return None
-
      @abstractmethod
      def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
          pass
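
Note: the base class now derives a concrete download location under `download_dir` from the file's relative source path instead of returning `None`. The path arithmetic in isolation (a sketch, not the class method itself):

from pathlib import Path

def download_path(download_dir: Path, rel_path: str) -> Path:
    # A leading slash is stripped so the result stays inside download_dir
    rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
    return download_dir / Path(rel_path)

assert download_path(Path("/tmp/ingest"), "/bucket/a.pdf") == Path("/tmp/ingest/bucket/a.pdf")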
unstructured_ingest/v2/interfaces/file_data.py
@@ -4,7 +4,6 @@ from pathlib import Path
  from typing import Any, Literal, Optional

  from dataclasses_json import DataClassJsonMixin
- from unstructured.documents.elements import DataSourceMetadata


  @dataclass
@@ -22,13 +21,25 @@ class SourceIdentifiers:
          return self.rel_path or self.fullpath


+ @dataclass
+ class FileDataSourceMetadata(DataClassJsonMixin):
+     url: Optional[str] = None
+     version: Optional[str] = None
+     record_locator: Optional[dict[str, Any]] = None
+     date_created: Optional[str] = None
+     date_modified: Optional[str] = None
+     date_processed: Optional[str] = None
+     permissions_data: Optional[list[dict[str, Any]]] = None
+     filesize_bytes: Optional[int] = None
+
+
  @dataclass
  class FileData(DataClassJsonMixin):
      identifier: str
      connector_type: str
      source_identifiers: Optional[SourceIdentifiers] = None
      doc_type: Literal["file", "batch"] = field(default="file")
-     metadata: DataSourceMetadata = field(default_factory=DataSourceMetadata)
+     metadata: FileDataSourceMetadata = field(default_factory=lambda: FileDataSourceMetadata())
      additional_metadata: dict[str, Any] = field(default_factory=dict)
      reprocess: bool = False
unstructured_ingest/v2/interfaces/indexer.py
@@ -1,15 +1,14 @@
  from abc import ABC, abstractmethod
- from dataclasses import dataclass
  from typing import Any, Generator, Optional, TypeVar

- from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
+ from pydantic import BaseModel
+
  from unstructured_ingest.v2.interfaces.connector import BaseConnector
  from unstructured_ingest.v2.interfaces.file_data import FileData
  from unstructured_ingest.v2.interfaces.process import BaseProcess


- @dataclass
- class IndexerConfig(EnhancedDataClassJsonMixin):
+ class IndexerConfig(BaseModel):
      pass

unstructured_ingest/v2/interfaces/process.py
@@ -8,13 +8,12 @@ class BaseProcess(ABC):
      def is_async(self) -> bool:
          return False

+     def precheck(self) -> None:
+         pass
+
      @abstractmethod
      def run(self, **kwargs: Any) -> Any:
          pass

      async def run_async(self, **kwargs: Any) -> Any:
          return self.run(**kwargs)
-
-     def check_connection(self):
-         # If the process requires external connections, run a quick check
-         pass
unstructured_ingest/v2/interfaces/processor.py
@@ -1,24 +1,24 @@
  import os
  from asyncio import Semaphore
- from dataclasses import dataclass, field
  from pathlib import Path
- from typing import Optional
+ from typing import Any, Optional

- from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
+ from pydantic import BaseModel, ConfigDict, Field

  DEFAULT_WORK_DIR = str((Path.home() / ".cache" / "unstructured" / "ingest" / "pipeline").resolve())


- @dataclass
- class ProcessorConfig(EnhancedDataClassJsonMixin):
+ class ProcessorConfig(BaseModel):
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+
      reprocess: bool = False
      verbose: bool = False
      tqdm: bool = False
-     work_dir: str = field(default_factory=lambda: DEFAULT_WORK_DIR)
+     work_dir: str = Field(default_factory=lambda: DEFAULT_WORK_DIR)
      num_processes: int = 2
      max_connections: Optional[int] = None
      raise_on_error: bool = False
-     disable_parallelism: bool = field(
+     disable_parallelism: bool = Field(
          default_factory=lambda: os.getenv("INGEST_DISABLE_PARALLELISM", "false").lower() == "true"
      )
      preserve_downloads: bool = False
@@ -28,10 +28,10 @@ class ProcessorConfig(EnhancedDataClassJsonMixin):
      uncompress: bool = False

      # Used to keep track of state in pipeline
-     status: dict = field(default_factory=dict)
-     semaphore: Optional[Semaphore] = field(init=False, default=None)
+     status: dict = Field(default_factory=dict)
+     semaphore: Optional[Semaphore] = Field(init=False, default=None)

-     def __post_init__(self):
+     def model_post_init(self, __context: Any) -> None:
          if self.max_connections is not None:
              self.semaphore = Semaphore(self.max_connections)

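
Note: `__post_init__` becomes pydantic's `model_post_init`, which runs after field validation; `arbitrary_types_allowed` is required because `asyncio.Semaphore` is not a type pydantic can validate. The same pattern in a minimal standalone model (`Throttled` is a hypothetical stand-in for `ProcessorConfig`):

from asyncio import Semaphore
from typing import Any, Optional

from pydantic import BaseModel, ConfigDict, Field

class Throttled(BaseModel):  # hypothetical
    model_config = ConfigDict(arbitrary_types_allowed=True)

    max_connections: Optional[int] = None
    semaphore: Optional[Semaphore] = Field(init=False, default=None)

    def model_post_init(self, __context: Any) -> None:
        # Runs after validation, like __post_init__ on a dataclass
        if self.max_connections is not None:
            self.semaphore = Semaphore(self.max_connections)

t = Throttled(max_connections=3)
assert t.semaphore is not None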
unstructured_ingest/v2/interfaces/upload_stager.py
@@ -3,13 +3,13 @@ from dataclasses import dataclass
  from pathlib import Path
  from typing import Any, TypeVar

- from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
+ from pydantic import BaseModel
+
  from unstructured_ingest.v2.interfaces.file_data import FileData
  from unstructured_ingest.v2.interfaces.process import BaseProcess


- @dataclass
- class UploadStagerConfig(EnhancedDataClassJsonMixin):
+ class UploadStagerConfig(BaseModel):
      pass

unstructured_ingest/v2/interfaces/uploader.py
@@ -3,14 +3,14 @@ from dataclasses import dataclass
  from pathlib import Path
  from typing import Any, TypeVar

- from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
+ from pydantic import BaseModel
+
  from unstructured_ingest.v2.interfaces.connector import BaseConnector
  from unstructured_ingest.v2.interfaces.file_data import FileData
  from unstructured_ingest.v2.interfaces.process import BaseProcess


- @dataclass
- class UploaderConfig(EnhancedDataClassJsonMixin):
+ class UploaderConfig(BaseModel):
      pass

unstructured_ingest/v2/pipeline/interfaces.py
@@ -92,7 +92,7 @@ class PipelineStep(ABC):

          if iterable:
              if len(iterable) == 1:
-                 return [self.process_serially(iterable)]
+                 return self.process_serially(iterable)
              if self.context.num_processes == 1:
                  return self.process_serially(iterable)
              with mp.Pool(
@@ -126,6 +126,8 @@ class PipelineStep(ABC):
              logger.info(
                  f"Calling {self.__class__.__name__} " f"with {len(iterable)} docs",  # type: ignore
              )
+         else:
+             logger.info(f"Calling {self.__class__.__name__} with no inputs")
          if self.context.async_supported and self.process.is_async():
              return self.process_async(iterable=iterable)
          if self.context.mp_supported:
@@ -146,8 +148,6 @@ class PipelineStep(ABC):
              logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
              if "file_data_path" in kwargs:
                  self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
-             else:
-                 self.context.status[self.identifier] = {"step_error": str(e)}
              if self.context.raise_on_error:
                  raise e
              return None
@@ -160,8 +160,6 @@ class PipelineStep(ABC):
              logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
              if "file_data_path" in kwargs:
                  self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
-             else:
-                 self.context.status[self.identifier] = {"step_error": str(e)}
              if self.context.raise_on_error:
                  raise e
              return None