unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff shows the changes between publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cli.py +6 -1
- unstructured_ingest/cli/cmds/__init__.py +4 -4
- unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
- unstructured_ingest/cli/interfaces.py +13 -6
- unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
- unstructured_ingest/connector/biomed.py +12 -5
- unstructured_ingest/connector/confluence.py +3 -3
- unstructured_ingest/connector/github.py +3 -2
- unstructured_ingest/connector/google_drive.py +1 -2
- unstructured_ingest/connector/mongodb.py +1 -2
- unstructured_ingest/connector/notion/client.py +31 -16
- unstructured_ingest/connector/notion/connector.py +3 -2
- unstructured_ingest/connector/registry.py +2 -2
- unstructured_ingest/connector/vectara.py +7 -2
- unstructured_ingest/interfaces.py +13 -9
- unstructured_ingest/pipeline/interfaces.py +8 -3
- unstructured_ingest/pipeline/reformat/chunking.py +13 -9
- unstructured_ingest/pipeline/reformat/embedding.py +3 -3
- unstructured_ingest/runner/__init__.py +2 -2
- unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
- unstructured_ingest/utils/chunking.py +45 -0
- unstructured_ingest/utils/dep_check.py +1 -1
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/v2/cli/base/cmd.py +57 -13
- unstructured_ingest/v2/cli/base/dest.py +21 -12
- unstructured_ingest/v2/cli/base/src.py +35 -23
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
- unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
- unstructured_ingest/v2/interfaces/connector.py +5 -7
- unstructured_ingest/v2/interfaces/downloader.py +8 -5
- unstructured_ingest/v2/interfaces/file_data.py +8 -2
- unstructured_ingest/v2/interfaces/indexer.py +3 -4
- unstructured_ingest/v2/interfaces/processor.py +10 -10
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/interfaces/uploader.py +3 -3
- unstructured_ingest/v2/pipeline/pipeline.py +9 -6
- unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
- unstructured_ingest/v2/pipeline/steps/download.py +13 -11
- unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
- unstructured_ingest/v2/pipeline/steps/index.py +14 -10
- unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
- unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +74 -28
- unstructured_ingest/v2/processes/connector_registry.py +8 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +18 -3
- unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +46 -39
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
- unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
- unstructured_ingest/v2/processes/connectors/couchbase.py +333 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
- unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
- unstructured_ingest/v2/processes/connectors/kdbai.py +170 -0
- unstructured_ingest/v2/processes/connectors/local.py +27 -16
- unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
- unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
- unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
- unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
- unstructured_ingest/v2/processes/connectors/pinecone.py +22 -21
- unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
- unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
- unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
- unstructured_ingest/v2/processes/connectors/sql.py +29 -31
- unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
- unstructured_ingest/v2/processes/embedder.py +106 -47
- unstructured_ingest/v2/processes/filter.py +11 -5
- unstructured_ingest/v2/processes/partitioner.py +79 -33
- unstructured_ingest/v2/processes/uncompress.py +3 -3
- unstructured_ingest/v2/utils.py +45 -0
- unstructured_ingest-0.0.5.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.0.5.dist-info/METADATA +574 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/RECORD +91 -116
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/WHEEL +1 -1
- unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
- unstructured_ingest/v2/cli/cmds/astra.py +0 -85
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
- unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
- unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
- unstructured_ingest/v2/cli/cmds/local.py +0 -52
- unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
- unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
- unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
- unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
- unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
- unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
- unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
- unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
- unstructured_ingest/v2/cli/cmds/sql.py +0 -84
- unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
- unstructured_ingest/v2/cli/configs/__init__.py +0 -13
- unstructured_ingest/v2/cli/configs/chunk.py +0 -89
- unstructured_ingest/v2/cli/configs/embed.py +0 -74
- unstructured_ingest/v2/cli/configs/filter.py +0 -28
- unstructured_ingest/v2/cli/configs/partition.py +0 -99
- unstructured_ingest/v2/cli/configs/processor.py +0 -88
- unstructured_ingest/v2/cli/interfaces.py +0 -27
- unstructured_ingest/v2/pipeline/utils.py +0 -15
- unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
- /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/sql.py

@@ -1,16 +1,15 @@
-import enum
 import json
 import uuid
 from dataclasses import dataclass, field
 from datetime import date, datetime
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
 
 import numpy as np
 import pandas as pd
 from dateutil import parser
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -33,40 +32,41 @@ if TYPE_CHECKING:
 
 CONNECTOR_TYPE = "sql"
 ELEMENTS_TABLE_NAME = "elements"
+SQLITE_DB = "sqlite"
+POSTGRESQL_DB = "postgresql"
 
 
-@dataclass
 class SQLAccessConfig(AccessConfig):
-    username: Optional[str] = None
-    password: Optional[str] = None
+    username: Optional[str] = Field(default=None, description="DB username")
+    password: Optional[str] = Field(default=None, description="DB password")
 
 
-
-    SQLITE = "sqlite"
-    POSTGRESQL = "postgresql"
+SecreteSQLAccessConfig = Secret[SQLAccessConfig]
 
 
-@dataclass
 class SQLConnectionConfig(ConnectionConfig):
-    db_type:
-
-        DatabaseType.SQLITE
+    db_type: Literal["sqlite", "postgresql"] = Field(
+        default=SQLITE_DB, description="Type of the database backend"
     )
-    database: Optional[str] =
-
-
-
-
+    database: Optional[str] = Field(
+        default=None,
+        description="Database name. For sqlite databases, this is the path to the .db file.",
+    )
+    host: Optional[str] = Field(default=None, description="DB host")
+    port: Optional[int] = Field(default=5432, description="DB host connection port")
+    access_config: SecreteSQLAccessConfig = Field(
+        default_factory=lambda: SecreteSQLAccessConfig(secret_value=SQLAccessConfig())
+    )
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
     def __post_init__(self):
-        if (self.db_type ==
+        if (self.db_type == SQLITE_DB) and (self.database is None):
             raise ValueError(
                 "A sqlite connection requires a path to a *.db file "
                 "through the `database` argument"
             )
 
 
-@dataclass
 class SQLUploadStagerConfig(UploadStagerConfig):
     pass
 
@@ -182,9 +182,8 @@ class SQLUploadStager(UploadStager):
         return output_path
 
 
-@dataclass
 class SQLUploaderConfig(UploaderConfig):
-    batch_size: int = 50
+    batch_size: int = Field(default=50, description="Number of records per batch")
 
 
 @dataclass
@@ -204,9 +203,9 @@ class SQLUploader(Uploader):
 
     @property
     def connection(self) -> Callable[[], Union["SqliteConnection", "PostgresConnection"]]:
-        if self.connection_config.db_type ==
+        if self.connection_config.db_type == POSTGRESQL_DB:
             return self._make_psycopg_connection
-        elif self.connection_config.db_type ==
+        elif self.connection_config.db_type == SQLITE_DB:
             return self._make_sqlite_connection
         raise ValueError(f"Unsupported database {self.connection_config.db_type} connection.")
 
@@ -219,9 +218,10 @@ class SQLUploader(Uploader):
     def _make_psycopg_connection(self) -> "PostgresConnection":
         from psycopg2 import connect
 
+        access_config = self.connection_config.access_config.get_secret_value()
         return connect(
-            user=
-            password=
+            user=access_config.username,
+            password=access_config.password,
             dbname=self.connection_config.database,
             host=self.connection_config.host,
             port=self.connection_config.port,
@@ -234,9 +234,7 @@ class SQLUploader(Uploader):
         for row in data:
             parsed = []
             for column_name, value in zip(columns, row):
-                if self.connection_config.db_type ==
-                    value, (list, dict)
-                ):
+                if self.connection_config.db_type == SQLITE_DB and isinstance(value, (list, dict)):
                     value = json.dumps(value)
                 if column_name in _DATE_COLUMNS:
                     if value is None:
@@ -255,14 +253,14 @@ class SQLUploader(Uploader):
 
         columns = tuple(df.columns)
         stmt = f"INSERT INTO {ELEMENTS_TABLE_NAME} ({','.join(columns)}) \
-                VALUES({','.join(['?' if self.connection_config.db_type==
+                VALUES({','.join(['?' if self.connection_config.db_type==SQLITE_DB else '%s' for x in columns])})"  # noqa E501
 
         for rows in pd.read_json(
            content.path, orient="records", lines=True, chunksize=self.upload_config.batch_size
         ):
             with self.connection() as conn:
                 values = self.prepare_data(columns, tuple(rows.itertuples(index=False, name=None)))
-                if self.connection_config.db_type ==
+                if self.connection_config.db_type == SQLITE_DB:
                     conn.executemany(stmt, values)
                 else:
                     with conn.cursor() as cur:
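The SQL connector's configs above move from enhanced dataclasses to pydantic models, with credentials wrapped in a pydantic Secret. A minimal sketch of how the rewritten destination config might be constructed, assuming the module path shown in the file list; the host, database, and credential values are placeholders:

from unstructured_ingest.v2.processes.connectors.sql import (
    SecreteSQLAccessConfig,
    SQLAccessConfig,
    SQLConnectionConfig,
)

# Credentials live inside the Secret wrapper and stay masked when the model
# is printed or dumped with default pydantic settings.
connection_config = SQLConnectionConfig(
    db_type="postgresql",
    database="elements",
    host="localhost",
    port=5432,
    access_config=SecreteSQLAccessConfig(
        secret_value=SQLAccessConfig(username="ingest", password="example-password")
    ),
)

# Unwrapping must be explicit, mirroring what _make_psycopg_connection does.
creds = connection_config.access_config.get_secret_value()
print(creds.username)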
unstructured_ingest/v2/processes/connectors/weaviate.py

@@ -5,8 +5,8 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional
 
 from dateutil import parser
+from pydantic import Field, Secret
 
-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -30,27 +30,37 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "weaviate"
 
 
-@dataclass
 class WeaviateAccessConfig(AccessConfig):
-    access_token: Optional[str] =
+    access_token: Optional[str] = Field(
+        default=None, description="Used to create the bearer token."
+    )
     api_key: Optional[str] = None
     client_secret: Optional[str] = None
     password: Optional[str] = None
 
 
-
+SecretWeaviateAccessConfig = Secret[WeaviateAccessConfig]
+
+
 class WeaviateConnectionConfig(ConnectionConfig):
-    host_url: str
-    class_name: str
-
+    host_url: str = Field(description="Weaviate instance url")
+    class_name: str = Field(
+        description="Name of the class to push the records into, e.g: Pdf-elements"
+    )
+    access_config: SecretWeaviateAccessConfig = Field(
+        default_factory=lambda: SecretWeaviateAccessConfig(secret_value=WeaviateAccessConfig())
+    )
     username: Optional[str] = None
-    anonymous: bool = False
+    anonymous: bool = Field(default=False, description="if set, all auth values will be ignored")
     scope: Optional[list[str]] = None
-    refresh_token: Optional[str] =
-
+    refresh_token: Optional[str] = Field(
+        default=None,
+        description="Will tie this value to the bearer token. If not provided, "
+        "the authentication will expire once the lifetime of the access token is up.",
+    )
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)
 
 
-@dataclass
 class WeaviateUploadStagerConfig(UploadStagerConfig):
     pass
 
@@ -148,9 +158,8 @@ class WeaviateUploadStager(UploadStager):
         return output_path
 
 
-@dataclass
 class WeaviateUploaderConfig(UploaderConfig):
-    batch_size: int = 100
+    batch_size: int = Field(default=100, description="Number of records per batch")
 
 
 @dataclass
unstructured_ingest/v2/processes/embedder.py

@@ -1,76 +1,135 @@
 from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Optional
+from typing import TYPE_CHECKING, Any, Literal, Optional
 
-from
-from unstructured.embed.interfaces import BaseEmbeddingEncoder
-from unstructured.staging.base import elements_from_json
+from pydantic import BaseModel, Field, SecretStr
 
-from unstructured_ingest.
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 
+if TYPE_CHECKING:
+    from unstructured.embed.interfaces import BaseEmbeddingEncoder
 
-
-class EmbedderConfig(
-    embedding_provider: Optional[
-
-
-
-
-
-
+
+class EmbedderConfig(BaseModel):
+    embedding_provider: Optional[
+        Literal[
+            "langchain-openai",
+            "langchain-huggingface",
+            "langchain-aws-bedrock",
+            "langchain-vertexai",
+            "langchain-voyageai",
+            "octoai",
+        ]
+    ] = Field(default=None, description="Type of the embedding class to be used.")
+    embedding_api_key: Optional[SecretStr] = Field(
+        default=None,
+        description="API key for the embedding model, for the case an API key is needed.",
+    )
+    embedding_model_name: Optional[str] = Field(
+        default=None,
+        description="Embedding model name, if needed. "
+        "Chooses a particular LLM between different options, to embed with it.",
+    )
+    embedding_aws_access_key_id: Optional[str] = Field(
+        default=None, description="AWS access key used for AWS-based embedders, such as bedrock"
+    )
+    embedding_aws_secret_access_key: Optional[SecretStr] = Field(
+        default=None, description="AWS secret key used for AWS-based embedders, such as bedrock"
+    )
+    embedding_aws_region: Optional[str] = Field(
+        default="us-west-2", description="AWS region used for AWS-based embedders, such as bedrock"
+    )
+
+    @requires_dependencies(dependencies=["unstructured"], extras="embed-huggingface")
+    def get_huggingface_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
+        from unstructured.embed.huggingface import (
+            HuggingFaceEmbeddingConfig,
+            HuggingFaceEmbeddingEncoder,
+        )
+
+        return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**embedding_kwargs))
+
+    @requires_dependencies(dependencies=["unstructured"], extras="openai")
+    def get_openai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
+        from unstructured.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
+
+        return OpenAIEmbeddingEncoder(config=OpenAIEmbeddingConfig(**embedding_kwargs))
+
+    @requires_dependencies(dependencies=["unstructured"], extras="embed-octoai")
+    def get_octoai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
+        from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
+
+        return OctoAIEmbeddingEncoder(config=OctoAiEmbeddingConfig(**embedding_kwargs))
+
+    @requires_dependencies(dependencies=["unstructured"], extras="bedrock")
+    def get_bedrock_embedder(self) -> "BaseEmbeddingEncoder":
+        from unstructured.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
+
+        return BedrockEmbeddingEncoder(
+            config=BedrockEmbeddingConfig(
+                aws_access_key_id=self.embedding_aws_access_key_id,
+                aws_secret_access_key=self.embedding_aws_secret_access_key.get_secret_value(),
+                region_name=self.embedding_aws_region,
+            )
+        )
+
+    @requires_dependencies(dependencies=["unstructured"], extras="embed-vertexai")
+    def get_vertexai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
+        from unstructured.embed.vertexai import (
+            VertexAIEmbeddingConfig,
+            VertexAIEmbeddingEncoder,
+        )
+
+        return VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig(**embedding_kwargs))
+
+    @requires_dependencies(dependencies=["unstructured"], extras="embed-voyageai")
+    def get_voyageai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder":
+        from unstructured.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
+
+        return VoyageAIEmbeddingEncoder(config=VoyageAIEmbeddingConfig(**embedding_kwargs))
+
+    def get_embedder(self) -> "BaseEmbeddingEncoder":
         kwargs: dict[str, Any] = {}
         if self.embedding_api_key:
-            kwargs["api_key"] = self.embedding_api_key
+            kwargs["api_key"] = self.embedding_api_key.get_secret_value()
         if self.embedding_model_name:
             kwargs["model_name"] = self.embedding_model_name
         # TODO make this more dynamic to map to encoder configs
         if self.embedding_provider == "langchain-openai":
-
+            return self.get_openai_embedder(embedding_kwargs=kwargs)
 
-
-
-            from unstructured.embed.huggingface import (
-                HuggingFaceEmbeddingConfig,
-                HuggingFaceEmbeddingEncoder,
-            )
+        if self.embedding_provider == "langchain-huggingface":
+            return self.get_huggingface_embedder(embedding_kwargs=kwargs)
 
-
-
-            from unstructured.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
+        if self.embedding_provider == "octoai":
+            return self.get_octoai_embedder(embedding_kwargs=kwargs)
 
-
-
-            from unstructured.embed.bedrock import BedrockEmbeddingConfig, BedrockEmbeddingEncoder
+        if self.embedding_provider == "langchain-aws-bedrock":
+            return self.get_bedrock_embedder()
 
-
-
-                aws_access_key_id=self.embedding_aws_access_key_id,
-                aws_secret_access_key=self.embedding_aws_secret_access_key,
-                region_name=self.embedding_aws_region,
-            )
-            )
-        elif self.embedding_provider == "langchain-vertexai":
-            from unstructured.embed.vertexai import (
-                VertexAIEmbeddingConfig,
-                VertexAIEmbeddingEncoder,
-            )
+        if self.embedding_provider == "langchain-vertexai":
+            return self.get_vertexai_embedder(embedding_kwargs=kwargs)
 
-
-
-
+        if self.embedding_provider == "langchain-voyageai":
+            return self.get_voyageai_embedder(embedding_kwargs=kwargs)
+
+        raise ValueError(f"{self.embedding_provider} not a recognized encoder")
 
 
 @dataclass
 class Embedder(BaseProcess, ABC):
     config: EmbedderConfig
 
-
+    @requires_dependencies(dependencies=["unstructured"])
+    def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
+        from unstructured.staging.base import elements_from_json
+
         # TODO update base embedder classes to support async
         embedder = self.config.get_embedder()
         elements = elements_from_json(filename=str(elements_filepath))
         if not elements:
-            return elements
-
+            return [e.to_dict() for e in elements]
+        embedded_elements = embedder.embed_documents(elements=elements)
+        return [e.to_dict() for e in embedded_elements]
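EmbedderConfig now validates the provider against a Literal, stores the API key as a SecretStr, and dispatches to per-provider helper methods. A rough usage sketch, assuming the `unstructured` package with the matching embedding extra is installed; the key and model name are placeholders:

from unstructured_ingest.v2.processes.embedder import EmbedderConfig

config = EmbedderConfig(
    embedding_provider="langchain-openai",
    embedding_api_key="sk-example",  # coerced to SecretStr by pydantic
    embedding_model_name="text-embedding-3-small",
)

# get_embedder() dispatches on embedding_provider and unwraps the SecretStr
# before handing it to the encoder config.
encoder = config.get_embedder()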
unstructured_ingest/v2/processes/filter.py

@@ -3,16 +3,22 @@ from abc import ABC
 from dataclasses import dataclass, field
 from typing import Any, Callable, Optional
 
-from
+from pydantic import BaseModel, Field
+
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
 
 
-
-
-
-
+class FiltererConfig(BaseModel):
+    file_glob: Optional[list[str]] = Field(
+        default=None,
+        description="file globs to limit which types of " "files are accepted",
+        examples=["*.pdf", "*.html"],
+    )
+    max_file_size: Optional[int] = Field(
+        default=None, description="Max file size to process in bytes"
+    )
 
 
 @dataclass
unstructured_ingest/v2/processes/partitioner.py

@@ -1,14 +1,13 @@
 import asyncio
 from abc import ABC
-from dataclasses import dataclass,
+from dataclasses import dataclass, fields
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional
 
-from
-from unstructured.staging.base import elements_to_dicts, flatten_dict
+from pydantic import BaseModel, Field, SecretStr
 
-from unstructured_ingest.
-from unstructured_ingest.
+from unstructured_ingest.utils.data_prep import flatten_dict
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
 
@@ -17,25 +16,65 @@ if TYPE_CHECKING:
     from unstructured_client.models.shared import PartitionParameters
 
 
-
-
-
-
-
-
-
-
+class PartitionerConfig(BaseModel):
+    strategy: str = Field(
+        default="auto",
+        description="The method that will be used to process the documents. ",
+        examples=["fast", "hi_res", "auto"],
+    )
+    ocr_languages: Optional[list[str]] = Field(
+        default=None,
+        description="A list of language packs to specify which languages to use for OCR, "
+        "The appropriate Tesseract language pack needs to be installed.",
+        examples=["eng", "deu", "eng,deu"],
+    )
+    encoding: Optional[str] = Field(
+        default=None,
+        description="Text encoding to use when reading documents. "
+        "By default the encoding is detected automatically.",
+    )
+    additional_partition_args: Optional[dict[str, Any]] = Field(
+        default=None, description="Additional values to pass through to partition()"
+    )
+    skip_infer_table_types: Optional[list[str]] = Field(
+        default=None, description="Optional list of document types to skip table extraction on"
+    )
+    fields_include: list[str] = Field(
         default_factory=lambda: ["element_id", "text", "type", "metadata", "embeddings"],
+        description="If set, include the specified top-level fields in an element.",
+    )
+    flatten_metadata: bool = Field(
+        default=False,
+        description="Results in flattened json elements. "
+        "Specifically, the metadata key values are brought to "
+        "the top-level of the element, and the `metadata` key itself is removed.",
+    )
+    metadata_exclude: list[str] = Field(
+        default_factory=list,
+        description="If set, drop the specified metadata " "fields if they exist.",
     )
-
-
-
-
-
-
-
-
-
+    metadata_include: list[str] = Field(
+        default_factory=list,
+        description="If set, include the specified metadata "
+        "fields if they exist and drop all other fields. ",
+    )
+    partition_endpoint: Optional[str] = Field(
+        default="https://api.unstructured.io/general/v0/general",
+        description="If partitioning via api, use the following host.",
+    )
+    partition_by_api: bool = Field(
+        default=False,
+        description="Use a remote API to partition the files."
+        " Otherwise, use the function from partition.auto",
+    )
+    api_key: Optional[SecretStr] = Field(
+        default=None, description="API Key for partition endpoint."
+    )
+    hi_res_model_name: Optional[str] = Field(
+        default=None, description="Model name for hi-res strategy."
+    )
+
+    def model_post_init(self, __context: Any) -> None:
         if self.metadata_exclude and self.metadata_include:
             raise ValueError(
                 "metadata_exclude and metadata_include are "
@@ -93,16 +132,23 @@ class Partitioner(BaseProcess, ABC):
             elem.update(flatten_dict(metadata, keys_to_omit=["data_source_record_locator"]))
         return element_dicts
 
+    @requires_dependencies(dependencies=["unstructured"])
     def partition_locally(
-        self, filename: Path, metadata: Optional[
+        self, filename: Path, metadata: Optional[dict] = None, **kwargs
     ) -> list[dict]:
+        from unstructured.documents.elements import DataSourceMetadata
         from unstructured.partition.auto import partition
+        from unstructured.staging.base import elements_to_dicts
+
+        @dataclass
+        class FileDataSourceMetadata(DataSourceMetadata):
+            filesize_bytes: Optional[int] = None
 
         logger.debug(f"Using local partition with kwargs: {self.config.to_partition_kwargs()}")
-        logger.debug(f"partitioning file {filename} with metadata {metadata
+        logger.debug(f"partitioning file {filename} with metadata {metadata}")
         elements = partition(
             filename=str(filename.resolve()),
-            data_source_metadata=metadata,
+            data_source_metadata=FileDataSourceMetadata.from_dict(metadata),
             **self.config.to_partition_kwargs(),
         )
         return self.postprocess(elements=elements_to_dicts(elements))
@@ -138,29 +184,29 @@ class Partitioner(BaseProcess, ABC):
         partition_params = PartitionParameters(**filtered_partition_request)
         return partition_params
 
+    @requires_dependencies(dependencies=["unstructured_client"], extras="remote")
     async def partition_via_api(
-        self, filename: Path, metadata: Optional[
+        self, filename: Path, metadata: Optional[dict] = None, **kwargs
     ) -> list[dict]:
         from unstructured_client import UnstructuredClient
 
-        logger.debug(f"partitioning file {filename} with metadata: {metadata
+        logger.debug(f"partitioning file {filename} with metadata: {metadata}")
         client = UnstructuredClient(
-            server_url=self.config.partition_endpoint,
+            server_url=self.config.partition_endpoint,
+            api_key_auth=self.config.api_key.get_secret_value(),
         )
         partition_params = self.create_partition_parameters(filename=filename)
         resp = await self.call_api(client=client, request=partition_params)
         elements = resp.elements or []
         # Append the data source metadata the auto partition does for you
         for element in elements:
-            element["metadata"]["data_source"] = metadata
+            element["metadata"]["data_source"] = metadata
         return self.postprocess(elements=elements)
 
-    def run(
-        self, filename: Path, metadata: Optional[DataSourceMetadata] = None, **kwargs
-    ) -> list[dict]:
+    def run(self, filename: Path, metadata: Optional[dict] = None, **kwargs) -> list[dict]:
         return self.partition_locally(filename, metadata=metadata, **kwargs)
 
     async def run_async(
-        self, filename: Path, metadata: Optional[
+        self, filename: Path, metadata: Optional[dict] = None, **kwargs
     ) -> list[dict]:
         return await self.partition_via_api(filename, metadata=metadata, **kwargs)
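With api_key now a SecretStr, partition_via_api unwraps it via get_secret_value() before building the UnstructuredClient. A small sketch of an API-based partitioner config, assuming the default hosted endpoint; the key value is a placeholder:

from unstructured_ingest.v2.processes.partitioner import PartitionerConfig

config = PartitionerConfig(
    strategy="hi_res",
    partition_by_api=True,
    partition_endpoint="https://api.unstructured.io/general/v0/general",
    api_key="example-api-key",  # stored as SecretStr, unwrapped only for the client
)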
unstructured_ingest/v2/processes/uncompress.py

@@ -4,14 +4,14 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
 
-from
+from pydantic import BaseModel
+
 from unstructured_ingest.utils.compression import TAR_FILE_EXT, ZIP_FILE_EXT, uncompress_file
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 
 
-
-class UncompressConfig(EnhancedDataClassJsonMixin):
+class UncompressConfig(BaseModel):
     pass
 
 
unstructured_ingest/v2/utils.py (new file)

@@ -0,0 +1,45 @@
+import json
+from datetime import datetime
+from inspect import isclass
+from pathlib import Path
+from typing import Any
+
+from pydantic import BaseModel
+from pydantic.types import _SecretBase
+
+
+def is_secret(value: Any) -> bool:
+    # Case Secret[int]
+    if hasattr(value, "__origin__") and hasattr(value, "__args__"):
+        origin = value.__origin__
+        return isclass(origin) and issubclass(origin, _SecretBase)
+    # Case SecretStr
+    return isclass(value) and issubclass(value, _SecretBase)
+
+
+def serialize_base_model(model: BaseModel) -> dict:
+    # To get the full serialized dict regardless of if values are marked as Secret
+    model_dict = model.dict()
+    for k, v in model_dict.items():
+        if isinstance(v, _SecretBase):
+            secret_value = v.get_secret_value()
+            if isinstance(secret_value, BaseModel):
+                model_dict[k] = serialize_base_model(model=secret_value)
+            else:
+                model_dict[k] = secret_value
+
+    return model_dict
+
+
+def serialize_base_model_json(model: BaseModel, **json_kwargs) -> str:
+    model_dict = serialize_base_model(model=model)
+
+    def json_serial(obj):
+        if isinstance(obj, Path):
+            return obj.as_posix()
+        if isinstance(obj, datetime):
+            return obj.isoformat()
+        raise TypeError("Type %s not serializable" % type(obj))
+
+    # Support json dumps kwargs such as sort_keys
+    return json.dumps(model_dict, default=json_serial, **json_kwargs)