unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cli.py +6 -1
- unstructured_ingest/cli/cmds/__init__.py +4 -4
- unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
- unstructured_ingest/cli/interfaces.py +13 -6
- unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
- unstructured_ingest/connector/biomed.py +12 -5
- unstructured_ingest/connector/confluence.py +3 -3
- unstructured_ingest/connector/github.py +3 -2
- unstructured_ingest/connector/google_drive.py +1 -2
- unstructured_ingest/connector/mongodb.py +1 -2
- unstructured_ingest/connector/notion/client.py +31 -16
- unstructured_ingest/connector/notion/connector.py +3 -2
- unstructured_ingest/connector/registry.py +2 -2
- unstructured_ingest/connector/vectara.py +7 -2
- unstructured_ingest/interfaces.py +13 -9
- unstructured_ingest/pipeline/interfaces.py +8 -3
- unstructured_ingest/pipeline/reformat/chunking.py +13 -9
- unstructured_ingest/pipeline/reformat/embedding.py +3 -3
- unstructured_ingest/runner/__init__.py +2 -2
- unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
- unstructured_ingest/utils/chunking.py +45 -0
- unstructured_ingest/utils/dep_check.py +1 -1
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/v2/cli/base/cmd.py +66 -12
- unstructured_ingest/v2/cli/base/dest.py +21 -12
- unstructured_ingest/v2/cli/base/src.py +35 -21
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
- unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/connector.py +5 -7
- unstructured_ingest/v2/interfaces/downloader.py +17 -8
- unstructured_ingest/v2/interfaces/file_data.py +13 -2
- unstructured_ingest/v2/interfaces/indexer.py +3 -4
- unstructured_ingest/v2/interfaces/process.py +3 -4
- unstructured_ingest/v2/interfaces/processor.py +10 -10
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/interfaces/uploader.py +3 -3
- unstructured_ingest/v2/pipeline/interfaces.py +3 -5
- unstructured_ingest/v2/pipeline/pipeline.py +73 -7
- unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
- unstructured_ingest/v2/pipeline/steps/download.py +90 -24
- unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +14 -10
- unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
- unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +74 -28
- unstructured_ingest/v2/processes/connector_registry.py +8 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
- unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +53 -35
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +38 -27
- unstructured_ingest/v2/processes/connectors/chroma.py +38 -27
- unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +95 -31
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +92 -53
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +47 -16
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -13
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +18 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +49 -61
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +46 -13
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +50 -20
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +27 -28
- unstructured_ingest/v2/processes/connectors/google_drive.py +52 -42
- unstructured_ingest/v2/processes/connectors/local.py +36 -28
- unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
- unstructured_ingest/v2/processes/connectors/mongodb.py +32 -22
- unstructured_ingest/v2/processes/connectors/onedrive.py +31 -16
- unstructured_ingest/v2/processes/connectors/opensearch.py +81 -43
- unstructured_ingest/v2/processes/connectors/pinecone.py +29 -23
- unstructured_ingest/v2/processes/connectors/salesforce.py +36 -26
- unstructured_ingest/v2/processes/connectors/sharepoint.py +64 -33
- unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
- unstructured_ingest/v2/processes/connectors/sql.py +52 -39
- unstructured_ingest/v2/processes/connectors/weaviate.py +35 -18
- unstructured_ingest/v2/processes/embedder.py +106 -47
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +79 -33
- unstructured_ingest/v2/processes/uncompress.py +3 -3
- unstructured_ingest/v2/utils.py +45 -0
- unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +92 -116
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
- unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
- unstructured_ingest/v2/cli/cmds/astra.py +0 -85
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
- unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -77
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
- unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
- unstructured_ingest/v2/cli/cmds/local.py +0 -60
- unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
- unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
- unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
- unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
- unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
- unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
- unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
- unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
- unstructured_ingest/v2/cli/cmds/sql.py +0 -84
- unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
- unstructured_ingest/v2/cli/configs/__init__.py +0 -6
- unstructured_ingest/v2/cli/configs/chunk.py +0 -89
- unstructured_ingest/v2/cli/configs/embed.py +0 -74
- unstructured_ingest/v2/cli/configs/partition.py +0 -99
- unstructured_ingest/v2/cli/configs/processor.py +0 -88
- unstructured_ingest/v2/cli/interfaces.py +0 -27
- unstructured_ingest/v2/pipeline/utils.py +0 -15
- unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
- /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
|
@@ -1,16 +1,12 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import os.path
|
|
3
|
-
import sys
|
|
4
|
-
from dataclasses import fields, is_dataclass
|
|
5
3
|
from gettext import gettext, ngettext
|
|
6
4
|
from gettext import gettext as _
|
|
7
5
|
from pathlib import Path
|
|
8
|
-
from typing import Any,
|
|
6
|
+
from typing import Any, Optional, Type, TypeVar
|
|
9
7
|
|
|
10
8
|
import click
|
|
11
|
-
|
|
12
|
-
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
|
|
13
|
-
from unstructured_ingest.v2.logger import logger
|
|
9
|
+
from pydantic import BaseModel, ConfigDict, Secret
|
|
14
10
|
|
|
15
11
|
|
|
16
12
|
def conform_click_options(options: dict):
|
|
@@ -30,7 +26,13 @@ class Dict(click.ParamType):
|
|
|
30
26
|
ctx: Optional[click.Context] = None,
|
|
31
27
|
) -> Any:
|
|
32
28
|
try:
|
|
33
|
-
|
|
29
|
+
if isinstance(value, dict):
|
|
30
|
+
return value
|
|
31
|
+
if isinstance(value, Path) and value.is_file():
|
|
32
|
+
with value.open() as f:
|
|
33
|
+
return json.load(f)
|
|
34
|
+
if isinstance(value, str):
|
|
35
|
+
return json.loads(value)
|
|
34
36
|
except json.JSONDecodeError:
|
|
35
37
|
self.fail(
|
|
36
38
|
gettext(
|
|
@@ -107,86 +109,33 @@ class DelimitedString(click.ParamType):
|
|
|
107
109
|
return split
|
|
108
110
|
|
|
109
111
|
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
)
|
|
118
|
-
""
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
dd = inner_d.copy()
|
|
139
|
-
for field in fields(inner_config):
|
|
140
|
-
f_type = field.type
|
|
141
|
-
# typing can be defined using a string, in which case it needs to be resolved
|
|
142
|
-
# to the actual type. following logic is cherry picked from the typing
|
|
143
|
-
# get_type_hints() since type resolution can be expensive, only do it
|
|
144
|
-
# when the type is a string
|
|
145
|
-
if isinstance(f_type, str):
|
|
146
|
-
try:
|
|
147
|
-
base_globals = sys.modules[inner_config.__module__].__dict__
|
|
148
|
-
for_ref = ForwardRef(f_type, is_argument=False, is_class=True)
|
|
149
|
-
f_type = for_ref._evaluate(
|
|
150
|
-
globalns=base_globals, localns=None, recursive_guard=frozenset()
|
|
151
|
-
)
|
|
152
|
-
except NameError as e:
|
|
153
|
-
logger.warning(f"couldn't resolve type {f_type}: {e}")
|
|
154
|
-
# Handle the case where the type of a value if a Union (possibly optional)
|
|
155
|
-
if get_origin(f_type) is Union:
|
|
156
|
-
union_values = get_args(f_type)
|
|
157
|
-
# handle List types
|
|
158
|
-
union_values = [
|
|
159
|
-
get_args(u)[0] if get_origin(u) is list else u for u in union_values
|
|
160
|
-
]
|
|
161
|
-
# Ignore injected NoneType when optional
|
|
162
|
-
concrete_union_values = [v for v in union_values if not is_subclass(v, type(None))]
|
|
163
|
-
dataclass_union_values = [v for v in concrete_union_values if is_dataclass(v)]
|
|
164
|
-
non_dataclass_union_values = [
|
|
165
|
-
v for v in concrete_union_values if not is_dataclass(v)
|
|
166
|
-
]
|
|
167
|
-
if not dataclass_union_values:
|
|
168
|
-
continue
|
|
169
|
-
# Check if the key for this field already exists in the dictionary,
|
|
170
|
-
# if so it might map to one of these non dataclass fields and this
|
|
171
|
-
# can't be enforced
|
|
172
|
-
if non_dataclass_union_values and field.name in dd:
|
|
173
|
-
continue
|
|
174
|
-
if len(dataclass_union_values) > 1:
|
|
175
|
-
logger.warning(
|
|
176
|
-
"more than one dataclass type possible for field {}, "
|
|
177
|
-
"not extracting: {}".format(field.name, ", ".join(dataclass_union_values))
|
|
178
|
-
)
|
|
179
|
-
continue
|
|
180
|
-
f_type = dataclass_union_values[0]
|
|
181
|
-
origin = get_origin(f_type)
|
|
182
|
-
if origin:
|
|
183
|
-
f_type = origin
|
|
184
|
-
if is_subclass(f_type, EnhancedDataClassJsonMixin):
|
|
185
|
-
dd[field.name] = conform_dict(inner_d=dd, inner_config=f_type)
|
|
186
|
-
return dd
|
|
187
|
-
|
|
188
|
-
adjusted_dict = conform_dict(inner_d=flat_data, inner_config=config)
|
|
189
|
-
return config.from_dict(adjusted_dict, apply_name_overload=False)
|
|
112
|
+
BaseModelT = TypeVar("BaseModelT", bound=BaseModel)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def extract_config(flat_data: dict, config: Type[BaseModelT]) -> BaseModelT:
|
|
116
|
+
fields = config.model_fields
|
|
117
|
+
config.model_config = ConfigDict(extra="ignore")
|
|
118
|
+
field_names = [v.alias or k for k, v in fields.items()]
|
|
119
|
+
data = {k: v for k, v in flat_data.items() if k in field_names and v is not None}
|
|
120
|
+
if access_config := fields.get("access_config"):
|
|
121
|
+
access_config_type = access_config.annotation
|
|
122
|
+
# Check if raw type is wrapped by a secret
|
|
123
|
+
if (
|
|
124
|
+
hasattr(access_config_type, "__origin__")
|
|
125
|
+
and hasattr(access_config_type, "__args__")
|
|
126
|
+
and access_config_type.__origin__ is Secret
|
|
127
|
+
):
|
|
128
|
+
ac_subtypes = access_config_type.__args__
|
|
129
|
+
ac_fields = ac_subtypes[0].model_fields
|
|
130
|
+
elif issubclass(access_config_type, BaseModel):
|
|
131
|
+
ac_fields = access_config_type.model_fields
|
|
132
|
+
else:
|
|
133
|
+
raise TypeError(f"Unrecognized access_config type: {access_config_type}")
|
|
134
|
+
ac_field_names = [v.alias or k for k, v in ac_fields.items()]
|
|
135
|
+
data["access_config"] = {
|
|
136
|
+
k: v for k, v in flat_data.items() if k in ac_field_names and v is not None
|
|
137
|
+
}
|
|
138
|
+
return config.model_validate(obj=data)
|
|
190
139
|
|
|
191
140
|
|
|
192
141
|
class Group(click.Group):
|
|
@@ -195,13 +144,11 @@ class Group(click.Group):
|
|
|
195
144
|
This allows for subcommands to be called with the --help flag without breaking
|
|
196
145
|
if parent command is missing any of its required parameters
|
|
197
146
|
"""
|
|
198
|
-
|
|
199
147
|
try:
|
|
200
148
|
return super().parse_args(ctx, args)
|
|
201
149
|
except click.MissingParameter:
|
|
202
150
|
if "--help" not in args:
|
|
203
151
|
raise
|
|
204
|
-
|
|
205
152
|
# remove the required params so that help can display
|
|
206
153
|
for param in self.params:
|
|
207
154
|
param.required = False
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
import contextlib
|
|
2
|
+
import datetime
|
|
3
|
+
from collections import Counter
|
|
4
|
+
from enum import EnumMeta
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, Callable, Literal, Optional, Type, TypedDict, Union, get_args, get_origin
|
|
7
|
+
from uuid import UUID
|
|
8
|
+
|
|
9
|
+
import click
|
|
10
|
+
from annotated_types import Ge, Gt, Le, Lt, SupportsGe, SupportsGt, SupportsLe, SupportsLt
|
|
11
|
+
from click import Option
|
|
12
|
+
from pydantic import BaseModel, Secret, SecretStr
|
|
13
|
+
from pydantic.fields import FieldInfo
|
|
14
|
+
from pydantic.types import _SecretBase
|
|
15
|
+
from pydantic_core import PydanticUndefined
|
|
16
|
+
|
|
17
|
+
from unstructured_ingest.v2.cli.utils.click import DelimitedString, Dict
|
|
18
|
+
|
|
19
|
+
NoneType = type(None)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class _RangeDict(TypedDict, total=False):
|
|
23
|
+
"""Represent arguments to `click.IntRange` or `click.FloatRange`."""
|
|
24
|
+
|
|
25
|
+
max: Union[SupportsLt, SupportsLe]
|
|
26
|
+
min: Union[SupportsGt, SupportsGe]
|
|
27
|
+
max_open: bool
|
|
28
|
+
min_open: bool
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def get_range_from_metadata(metadata: list[Any]) -> _RangeDict:
|
|
32
|
+
range_args: _RangeDict = {}
|
|
33
|
+
for constraint in metadata:
|
|
34
|
+
if isinstance(constraint, Le):
|
|
35
|
+
range_args["max"] = constraint.le
|
|
36
|
+
range_args["max_open"] = False
|
|
37
|
+
if isinstance(constraint, Lt):
|
|
38
|
+
range_args["max"] = constraint.lt
|
|
39
|
+
range_args["max_open"] = True
|
|
40
|
+
if isinstance(constraint, Ge):
|
|
41
|
+
range_args["min"] = constraint.ge
|
|
42
|
+
range_args["min_open"] = False
|
|
43
|
+
if isinstance(constraint, Gt):
|
|
44
|
+
range_args["min"] = constraint.gt
|
|
45
|
+
range_args["min_open"] = True
|
|
46
|
+
return range_args
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def is_boolean_flag(field_info: FieldInfo) -> bool:
|
|
50
|
+
annotation = field_info.annotation
|
|
51
|
+
raw_annotation = get_raw_type(annotation)
|
|
52
|
+
return raw_annotation is bool
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def get_raw_type(val: Any) -> Any:
|
|
56
|
+
field_args = get_args(val)
|
|
57
|
+
field_origin = get_origin(val)
|
|
58
|
+
if field_origin is Union and len(field_args) == 2 and NoneType in field_args:
|
|
59
|
+
field_type = next(field_arg for field_arg in field_args if field_arg is not None)
|
|
60
|
+
return field_type
|
|
61
|
+
if field_origin is Secret and len(field_args) == 1:
|
|
62
|
+
field_type = next(field_arg for field_arg in field_args if field_arg is not None)
|
|
63
|
+
return field_type
|
|
64
|
+
if val is SecretStr:
|
|
65
|
+
return str
|
|
66
|
+
return val
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def get_default_value_from_field(field: FieldInfo) -> Optional[Union[Any, Callable[[], Any]]]:
|
|
70
|
+
if field.default is not PydanticUndefined:
|
|
71
|
+
return field.default
|
|
72
|
+
elif field.default_factory is not None:
|
|
73
|
+
return field.default_factory
|
|
74
|
+
return None
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def get_option_name(field_name: str, field_info: FieldInfo) -> str:
|
|
78
|
+
field_name = field_info.alias or field_name
|
|
79
|
+
if field_name.startswith("--"):
|
|
80
|
+
field_name = field_name[2:]
|
|
81
|
+
field_name = field_name.lower().replace("_", "-")
|
|
82
|
+
if is_boolean_flag(field_info):
|
|
83
|
+
return f"--{field_name}/--no-{field_name}"
|
|
84
|
+
return f"--{field_name}"
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def get_numerical_type(field: FieldInfo) -> click.ParamType:
|
|
88
|
+
range_args = get_range_from_metadata(field.metadata)
|
|
89
|
+
if field.annotation is int:
|
|
90
|
+
if range_args:
|
|
91
|
+
return click.IntRange(**range_args) # type: ignore[arg-type]
|
|
92
|
+
return click.INT
|
|
93
|
+
# Non-integer numerical types default to float
|
|
94
|
+
if range_args:
|
|
95
|
+
return click.FloatRange(**range_args) # type: ignore[arg-type]
|
|
96
|
+
return click.FLOAT
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def get_type_from_annotation(field_type: Any) -> click.ParamType:
|
|
100
|
+
field_origin = get_origin(field_type)
|
|
101
|
+
field_args = get_args(field_type)
|
|
102
|
+
if field_origin is Union and len(field_args) == 2 and NoneType in field_args:
|
|
103
|
+
field_type = next(field_arg for field_arg in field_args if field_arg is not None)
|
|
104
|
+
return get_type_from_annotation(field_type=field_type)
|
|
105
|
+
if field_origin is Secret and len(field_args) == 1:
|
|
106
|
+
field_type = next(field_arg for field_arg in field_args if field_arg is not None)
|
|
107
|
+
return get_type_from_annotation(field_type=field_type)
|
|
108
|
+
if field_origin is list and len(field_args) == 1 and field_args[0] is str:
|
|
109
|
+
return DelimitedString()
|
|
110
|
+
if field_type is SecretStr:
|
|
111
|
+
return click.STRING
|
|
112
|
+
if dict in [field_type, field_origin]:
|
|
113
|
+
return Dict()
|
|
114
|
+
if field_type is str:
|
|
115
|
+
return click.STRING
|
|
116
|
+
if field_type is bool:
|
|
117
|
+
return click.BOOL
|
|
118
|
+
if field_type is UUID:
|
|
119
|
+
return click.UUID
|
|
120
|
+
if field_type is Path:
|
|
121
|
+
return click.Path(path_type=Path)
|
|
122
|
+
if field_type in (datetime.datetime, datetime.date):
|
|
123
|
+
return click.DateTime()
|
|
124
|
+
if field_origin is Literal:
|
|
125
|
+
return click.Choice(field_args)
|
|
126
|
+
if isinstance(field_type, EnumMeta):
|
|
127
|
+
values = [i.value for i in field_type]
|
|
128
|
+
return click.Choice(values)
|
|
129
|
+
raise TypeError(f"Unexpected field type: {field_type}")
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _get_type_from_field(field: FieldInfo) -> click.ParamType:
|
|
133
|
+
raw_field_type = get_raw_type(field.annotation)
|
|
134
|
+
|
|
135
|
+
if raw_field_type in (int, float):
|
|
136
|
+
return get_numerical_type(field)
|
|
137
|
+
return get_type_from_annotation(field_type=field.annotation)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def get_option_from_field(option_name: str, field_info: FieldInfo) -> Option:
|
|
141
|
+
param_decls = [option_name]
|
|
142
|
+
help = field_info.description or ""
|
|
143
|
+
if examples := field_info.examples:
|
|
144
|
+
help += f" [Examples: {', '.join(examples)}]"
|
|
145
|
+
option_kwargs = {
|
|
146
|
+
"type": _get_type_from_field(field_info),
|
|
147
|
+
"default": get_default_value_from_field(field_info),
|
|
148
|
+
"required": field_info.is_required(),
|
|
149
|
+
"help": help,
|
|
150
|
+
"is_flag": is_boolean_flag(field_info),
|
|
151
|
+
"show_default": field_info.default is not PydanticUndefined,
|
|
152
|
+
}
|
|
153
|
+
return click.Option(param_decls=param_decls, **option_kwargs)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def is_subclass(x: Any, y: Any) -> bool:
|
|
157
|
+
with contextlib.suppress(TypeError):
|
|
158
|
+
return issubclass(x, y)
|
|
159
|
+
|
|
160
|
+
return False
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def post_check(options: list[Option]):
|
|
164
|
+
option_names = [option.name for option in options]
|
|
165
|
+
duplicate_names = [name for name, count in Counter(option_names).items() if count > 1]
|
|
166
|
+
if duplicate_names:
|
|
167
|
+
raise ValueError(
|
|
168
|
+
"the following field name were reused, all must be unique: {}".format(
|
|
169
|
+
", ".join(duplicate_names)
|
|
170
|
+
)
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def is_secret(value: Any) -> bool:
|
|
175
|
+
# Case Secret[int]
|
|
176
|
+
if hasattr(value, "__origin__") and hasattr(value, "__args__"):
|
|
177
|
+
origin = value.__origin__
|
|
178
|
+
return is_subclass(origin, _SecretBase)
|
|
179
|
+
# Case SecretStr
|
|
180
|
+
return is_subclass(value, _SecretBase)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def options_from_base_model(model: Union[BaseModel, Type[BaseModel]]) -> list[Option]:
|
|
184
|
+
options = []
|
|
185
|
+
model_fields = model.model_fields
|
|
186
|
+
for field_name, field_info in model_fields.items():
|
|
187
|
+
if field_info.init is False:
|
|
188
|
+
continue
|
|
189
|
+
option_name = get_option_name(field_name=field_name, field_info=field_info)
|
|
190
|
+
raw_annotation = get_raw_type(field_info.annotation)
|
|
191
|
+
if is_subclass(raw_annotation, BaseModel):
|
|
192
|
+
options.extend(options_from_base_model(model=raw_annotation))
|
|
193
|
+
else:
|
|
194
|
+
if is_secret(field_info.annotation):
|
|
195
|
+
field_info.description = f"[sensitive] {field_info.description}"
|
|
196
|
+
options.append(get_option_from_field(option_name=option_name, field_info=field_info))
|
|
197
|
+
|
|
198
|
+
post_check(options=options)
|
|
199
|
+
return options
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from .connector import AccessConfig, BaseConnector, ConnectionConfig
|
|
2
2
|
from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
|
|
3
|
-
from .file_data import FileData, SourceIdentifiers
|
|
3
|
+
from .file_data import FileData, FileDataSourceMetadata, SourceIdentifiers
|
|
4
4
|
from .indexer import Indexer, IndexerConfig
|
|
5
5
|
from .process import BaseProcess
|
|
6
6
|
from .processor import ProcessorConfig
|
|
@@ -26,4 +26,5 @@ __all__ = [
|
|
|
26
26
|
"AccessConfig",
|
|
27
27
|
"ConnectionConfig",
|
|
28
28
|
"BaseConnector",
|
|
29
|
+
"FileDataSourceMetadata",
|
|
29
30
|
]
|
|
@@ -2,11 +2,10 @@ from abc import ABC
|
|
|
2
2
|
from dataclasses import dataclass
|
|
3
3
|
from typing import Any, TypeVar
|
|
4
4
|
|
|
5
|
-
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
|
|
5
|
+
from pydantic import BaseModel, Secret
|
|
6
6
|
|
|
7
7
|
|
|
8
|
-
|
|
9
|
-
class AccessConfig(EnhancedDataClassJsonMixin):
|
|
8
|
+
class AccessConfig(BaseModel):
|
|
10
9
|
"""Meant to designate holding any sensitive information associated with other configs
|
|
11
10
|
and also for access specific configs."""
|
|
12
11
|
|
|
@@ -14,14 +13,13 @@ class AccessConfig(EnhancedDataClassJsonMixin):
|
|
|
14
13
|
AccessConfigT = TypeVar("AccessConfigT", bound=AccessConfig)
|
|
15
14
|
|
|
16
15
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
access_config: AccessConfigT
|
|
16
|
+
class ConnectionConfig(BaseModel):
|
|
17
|
+
access_config: Secret[AccessConfigT]
|
|
20
18
|
|
|
21
19
|
def get_access_config(self) -> dict[str, Any]:
|
|
22
20
|
if not self.access_config:
|
|
23
21
|
return {}
|
|
24
|
-
return self.access_config.
|
|
22
|
+
return self.access_config.get_secret_value().dict()
|
|
25
23
|
|
|
26
24
|
|
|
27
25
|
ConnectionConfigT = TypeVar("ConnectionConfigT", bound=ConnectionConfig)
|
|
@@ -1,18 +1,21 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from abc import ABC, abstractmethod
|
|
3
|
-
from dataclasses import dataclass
|
|
4
3
|
from pathlib import Path
|
|
5
4
|
from typing import Any, Optional, TypedDict, TypeVar, Union
|
|
6
5
|
|
|
7
|
-
from
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
|
+
|
|
8
8
|
from unstructured_ingest.v2.interfaces.connector import BaseConnector
|
|
9
9
|
from unstructured_ingest.v2.interfaces.file_data import FileData
|
|
10
10
|
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
11
11
|
|
|
12
12
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
13
|
+
class DownloaderConfig(BaseModel):
|
|
14
|
+
download_dir: Optional[Path] = Field(
|
|
15
|
+
default=None,
|
|
16
|
+
description="Where files are downloaded to, defaults to a location at"
|
|
17
|
+
"`$HOME/.cache/unstructured/ingest/<connector name>/<SHA256>`.",
|
|
18
|
+
)
|
|
16
19
|
|
|
17
20
|
|
|
18
21
|
DownloaderConfigT = TypeVar("DownloaderConfigT", bound=DownloaderConfig)
|
|
@@ -30,6 +33,15 @@ class Downloader(BaseProcess, BaseConnector, ABC):
|
|
|
30
33
|
connector_type: str
|
|
31
34
|
download_config: DownloaderConfigT
|
|
32
35
|
|
|
36
|
+
def get_download_path(self, file_data: FileData) -> Optional[Path]:
|
|
37
|
+
if not file_data.source_identifiers:
|
|
38
|
+
return None
|
|
39
|
+
rel_path = file_data.source_identifiers.relative_path
|
|
40
|
+
if not rel_path:
|
|
41
|
+
return None
|
|
42
|
+
rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
|
|
43
|
+
return self.download_dir / Path(rel_path)
|
|
44
|
+
|
|
33
45
|
@staticmethod
|
|
34
46
|
def is_float(value: str):
|
|
35
47
|
try:
|
|
@@ -68,9 +80,6 @@ class Downloader(BaseProcess, BaseConnector, ABC):
|
|
|
68
80
|
def is_async(self) -> bool:
|
|
69
81
|
return True
|
|
70
82
|
|
|
71
|
-
def get_download_path(self, file_data: FileData) -> Optional[Path]:
|
|
72
|
-
return None
|
|
73
|
-
|
|
74
83
|
@abstractmethod
|
|
75
84
|
def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
|
|
76
85
|
pass
|
|
@@ -4,7 +4,6 @@ from pathlib import Path
|
|
|
4
4
|
from typing import Any, Literal, Optional
|
|
5
5
|
|
|
6
6
|
from dataclasses_json import DataClassJsonMixin
|
|
7
|
-
from unstructured.documents.elements import DataSourceMetadata
|
|
8
7
|
|
|
9
8
|
|
|
10
9
|
@dataclass
|
|
@@ -22,13 +21,25 @@ class SourceIdentifiers:
|
|
|
22
21
|
return self.rel_path or self.fullpath
|
|
23
22
|
|
|
24
23
|
|
|
24
|
+
@dataclass
|
|
25
|
+
class FileDataSourceMetadata(DataClassJsonMixin):
|
|
26
|
+
url: Optional[str] = None
|
|
27
|
+
version: Optional[str] = None
|
|
28
|
+
record_locator: Optional[dict[str, Any]] = None
|
|
29
|
+
date_created: Optional[str] = None
|
|
30
|
+
date_modified: Optional[str] = None
|
|
31
|
+
date_processed: Optional[str] = None
|
|
32
|
+
permissions_data: Optional[list[dict[str, Any]]] = None
|
|
33
|
+
filesize_bytes: Optional[int] = None
|
|
34
|
+
|
|
35
|
+
|
|
25
36
|
@dataclass
|
|
26
37
|
class FileData(DataClassJsonMixin):
|
|
27
38
|
identifier: str
|
|
28
39
|
connector_type: str
|
|
29
40
|
source_identifiers: Optional[SourceIdentifiers] = None
|
|
30
41
|
doc_type: Literal["file", "batch"] = field(default="file")
|
|
31
|
-
metadata:
|
|
42
|
+
metadata: FileDataSourceMetadata = field(default_factory=lambda: FileDataSourceMetadata())
|
|
32
43
|
additional_metadata: dict[str, Any] = field(default_factory=dict)
|
|
33
44
|
reprocess: bool = False
|
|
34
45
|
|
|
@@ -1,15 +1,14 @@
|
|
|
1
1
|
from abc import ABC, abstractmethod
|
|
2
|
-
from dataclasses import dataclass
|
|
3
2
|
from typing import Any, Generator, Optional, TypeVar
|
|
4
3
|
|
|
5
|
-
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
6
|
from unstructured_ingest.v2.interfaces.connector import BaseConnector
|
|
7
7
|
from unstructured_ingest.v2.interfaces.file_data import FileData
|
|
8
8
|
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
|
|
12
|
-
class IndexerConfig(EnhancedDataClassJsonMixin):
|
|
11
|
+
class IndexerConfig(BaseModel):
|
|
13
12
|
pass
|
|
14
13
|
|
|
15
14
|
|
|
@@ -8,13 +8,12 @@ class BaseProcess(ABC):
|
|
|
8
8
|
def is_async(self) -> bool:
|
|
9
9
|
return False
|
|
10
10
|
|
|
11
|
+
def precheck(self) -> None:
|
|
12
|
+
pass
|
|
13
|
+
|
|
11
14
|
@abstractmethod
|
|
12
15
|
def run(self, **kwargs: Any) -> Any:
|
|
13
16
|
pass
|
|
14
17
|
|
|
15
18
|
async def run_async(self, **kwargs: Any) -> Any:
|
|
16
19
|
return self.run(**kwargs)
|
|
17
|
-
|
|
18
|
-
def check_connection(self):
|
|
19
|
-
# If the process requires external connections, run a quick check
|
|
20
|
-
pass
|
|
@@ -1,24 +1,24 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from asyncio import Semaphore
|
|
3
|
-
from dataclasses import dataclass, field
|
|
4
3
|
from pathlib import Path
|
|
5
|
-
from typing import Optional
|
|
4
|
+
from typing import Any, Optional
|
|
6
5
|
|
|
7
|
-
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
|
|
6
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
8
7
|
|
|
9
8
|
DEFAULT_WORK_DIR = str((Path.home() / ".cache" / "unstructured" / "ingest" / "pipeline").resolve())
|
|
10
9
|
|
|
11
10
|
|
|
12
|
-
|
|
13
|
-
|
|
11
|
+
class ProcessorConfig(BaseModel):
|
|
12
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
13
|
+
|
|
14
14
|
reprocess: bool = False
|
|
15
15
|
verbose: bool = False
|
|
16
16
|
tqdm: bool = False
|
|
17
|
-
work_dir: str =
|
|
17
|
+
work_dir: str = Field(default_factory=lambda: DEFAULT_WORK_DIR)
|
|
18
18
|
num_processes: int = 2
|
|
19
19
|
max_connections: Optional[int] = None
|
|
20
20
|
raise_on_error: bool = False
|
|
21
|
-
disable_parallelism: bool = field(
|
|
21
|
+
disable_parallelism: bool = Field(
|
|
22
22
|
default_factory=lambda: os.getenv("INGEST_DISABLE_PARALLELISM", "false").lower() == "true"
|
|
23
23
|
)
|
|
24
24
|
preserve_downloads: bool = False
|
|
@@ -28,10 +28,10 @@ class ProcessorConfig(EnhancedDataClassJsonMixin):
|
|
|
28
28
|
uncompress: bool = False
|
|
29
29
|
|
|
30
30
|
# Used to keep track of state in pipeline
|
|
31
|
-
status: dict =
|
|
32
|
-
semaphore: Optional[Semaphore] =
|
|
31
|
+
status: dict = Field(default_factory=dict)
|
|
32
|
+
semaphore: Optional[Semaphore] = Field(init=False, default=None)
|
|
33
33
|
|
|
34
|
-
def
|
|
34
|
+
def model_post_init(self, __context: Any) -> None:
|
|
35
35
|
if self.max_connections is not None:
|
|
36
36
|
self.semaphore = Semaphore(self.max_connections)
|
|
37
37
|
|
|
@@ -3,13 +3,13 @@ from dataclasses import dataclass
|
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import Any, TypeVar
|
|
5
5
|
|
|
6
|
-
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
7
8
|
from unstructured_ingest.v2.interfaces.file_data import FileData
|
|
8
9
|
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
9
10
|
|
|
10
11
|
|
|
11
|
-
|
|
12
|
-
class UploadStagerConfig(EnhancedDataClassJsonMixin):
|
|
12
|
+
class UploadStagerConfig(BaseModel):
|
|
13
13
|
pass
|
|
14
14
|
|
|
15
15
|
|
|
@@ -3,14 +3,14 @@ from dataclasses import dataclass
|
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import Any, TypeVar
|
|
5
5
|
|
|
6
|
-
from unstructured_ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
7
8
|
from unstructured_ingest.v2.interfaces.connector import BaseConnector
|
|
8
9
|
from unstructured_ingest.v2.interfaces.file_data import FileData
|
|
9
10
|
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
10
11
|
|
|
11
12
|
|
|
12
|
-
|
|
13
|
-
class UploaderConfig(EnhancedDataClassJsonMixin):
|
|
13
|
+
class UploaderConfig(BaseModel):
|
|
14
14
|
pass
|
|
15
15
|
|
|
16
16
|
|
|
@@ -92,7 +92,7 @@ class PipelineStep(ABC):
|
|
|
92
92
|
|
|
93
93
|
if iterable:
|
|
94
94
|
if len(iterable) == 1:
|
|
95
|
-
return
|
|
95
|
+
return self.process_serially(iterable)
|
|
96
96
|
if self.context.num_processes == 1:
|
|
97
97
|
return self.process_serially(iterable)
|
|
98
98
|
with mp.Pool(
|
|
@@ -126,6 +126,8 @@ class PipelineStep(ABC):
|
|
|
126
126
|
logger.info(
|
|
127
127
|
f"Calling {self.__class__.__name__} " f"with {len(iterable)} docs", # type: ignore
|
|
128
128
|
)
|
|
129
|
+
else:
|
|
130
|
+
logger.info(f"Calling {self.__class__.__name__} with no inputs")
|
|
129
131
|
if self.context.async_supported and self.process.is_async():
|
|
130
132
|
return self.process_async(iterable=iterable)
|
|
131
133
|
if self.context.mp_supported:
|
|
@@ -146,8 +148,6 @@ class PipelineStep(ABC):
|
|
|
146
148
|
logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
|
|
147
149
|
if "file_data_path" in kwargs:
|
|
148
150
|
self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
|
|
149
|
-
else:
|
|
150
|
-
self.context.status[self.identifier] = {"step_error": str(e)}
|
|
151
151
|
if self.context.raise_on_error:
|
|
152
152
|
raise e
|
|
153
153
|
return None
|
|
@@ -160,8 +160,6 @@ class PipelineStep(ABC):
|
|
|
160
160
|
logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
|
|
161
161
|
if "file_data_path" in kwargs:
|
|
162
162
|
self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
|
|
163
|
-
else:
|
|
164
|
-
self.context.status[self.identifier] = {"step_error": str(e)}
|
|
165
163
|
if self.context.raise_on_error:
|
|
166
164
|
raise e
|
|
167
165
|
return None
|