unstructured-ingest 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest has been flagged as potentially problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cli.py +6 -1
- unstructured_ingest/cli/cmds/__init__.py +4 -4
- unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
- unstructured_ingest/cli/interfaces.py +13 -6
- unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
- unstructured_ingest/connector/biomed.py +12 -5
- unstructured_ingest/connector/confluence.py +3 -3
- unstructured_ingest/connector/github.py +3 -2
- unstructured_ingest/connector/google_drive.py +1 -2
- unstructured_ingest/connector/mongodb.py +1 -2
- unstructured_ingest/connector/notion/client.py +31 -16
- unstructured_ingest/connector/notion/connector.py +3 -2
- unstructured_ingest/connector/registry.py +2 -2
- unstructured_ingest/connector/vectara.py +7 -2
- unstructured_ingest/interfaces.py +13 -9
- unstructured_ingest/pipeline/interfaces.py +8 -3
- unstructured_ingest/pipeline/reformat/chunking.py +13 -9
- unstructured_ingest/pipeline/reformat/embedding.py +3 -3
- unstructured_ingest/runner/__init__.py +2 -2
- unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
- unstructured_ingest/utils/chunking.py +45 -0
- unstructured_ingest/utils/dep_check.py +1 -1
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/v2/cli/base/cmd.py +57 -13
- unstructured_ingest/v2/cli/base/dest.py +21 -12
- unstructured_ingest/v2/cli/base/src.py +35 -23
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
- unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
- unstructured_ingest/v2/interfaces/connector.py +5 -7
- unstructured_ingest/v2/interfaces/downloader.py +8 -5
- unstructured_ingest/v2/interfaces/file_data.py +8 -2
- unstructured_ingest/v2/interfaces/indexer.py +3 -4
- unstructured_ingest/v2/interfaces/processor.py +10 -10
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/interfaces/uploader.py +3 -3
- unstructured_ingest/v2/pipeline/pipeline.py +9 -6
- unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
- unstructured_ingest/v2/pipeline/steps/download.py +13 -11
- unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -6
- unstructured_ingest/v2/pipeline/steps/index.py +14 -10
- unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
- unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +74 -28
- unstructured_ingest/v2/processes/connector_registry.py +8 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +18 -3
- unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +46 -39
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +30 -27
- unstructured_ingest/v2/processes/connectors/chroma.py +30 -21
- unstructured_ingest/v2/processes/connectors/couchbase.py +333 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +87 -32
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +70 -45
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +39 -16
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +15 -13
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +10 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +20 -34
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +38 -13
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +31 -17
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +19 -28
- unstructured_ingest/v2/processes/connectors/google_drive.py +40 -34
- unstructured_ingest/v2/processes/connectors/kdbai.py +170 -0
- unstructured_ingest/v2/processes/connectors/local.py +27 -16
- unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
- unstructured_ingest/v2/processes/connectors/mongodb.py +22 -18
- unstructured_ingest/v2/processes/connectors/onedrive.py +17 -14
- unstructured_ingest/v2/processes/connectors/opensearch.py +66 -56
- unstructured_ingest/v2/processes/connectors/pinecone.py +22 -21
- unstructured_ingest/v2/processes/connectors/salesforce.py +26 -18
- unstructured_ingest/v2/processes/connectors/sharepoint.py +51 -26
- unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
- unstructured_ingest/v2/processes/connectors/sql.py +29 -31
- unstructured_ingest/v2/processes/connectors/weaviate.py +22 -13
- unstructured_ingest/v2/processes/embedder.py +106 -47
- unstructured_ingest/v2/processes/filter.py +11 -5
- unstructured_ingest/v2/processes/partitioner.py +79 -33
- unstructured_ingest/v2/processes/uncompress.py +3 -3
- unstructured_ingest/v2/utils.py +45 -0
- unstructured_ingest-0.0.5.dist-info/LICENSE.md +201 -0
- unstructured_ingest-0.0.5.dist-info/METADATA +574 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/RECORD +91 -116
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/WHEEL +1 -1
- unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
- unstructured_ingest/v2/cli/cmds/astra.py +0 -85
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
- unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -69
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
- unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
- unstructured_ingest/v2/cli/cmds/local.py +0 -52
- unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
- unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
- unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
- unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
- unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
- unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
- unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
- unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
- unstructured_ingest/v2/cli/cmds/sql.py +0 -84
- unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
- unstructured_ingest/v2/cli/configs/__init__.py +0 -13
- unstructured_ingest/v2/cli/configs/chunk.py +0 -89
- unstructured_ingest/v2/cli/configs/embed.py +0 -74
- unstructured_ingest/v2/cli/configs/filter.py +0 -28
- unstructured_ingest/v2/cli/configs/partition.py +0 -99
- unstructured_ingest/v2/cli/configs/processor.py +0 -88
- unstructured_ingest/v2/cli/interfaces.py +0 -27
- unstructured_ingest/v2/pipeline/utils.py +0 -15
- unstructured_ingest-0.0.3.dist-info/METADATA +0 -175
- /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.3.dist-info → unstructured_ingest-0.0.5.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/processes/connectors/google_drive.py
(some removed lines below are truncated in the rendered diff and are reproduced as shown)
@@ -1,18 +1,18 @@
 import io
-import
+import json
 from dataclasses import dataclass, field
-from
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional

 from dateutil import parser
-from
+from pydantic import Field, Secret

-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import (
     SourceConnectionError,
     SourceConnectionNetworkError,
 )
 from unstructured_ingest.utils.dep_check import requires_dependencies
-from unstructured_ingest.utils.
+from unstructured_ingest.utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -37,46 +37,54 @@ if TYPE_CHECKING:
     from googleapiclient.http import MediaIoBaseDownload


-@dataclass
 class GoogleDriveAccessConfig(AccessConfig):
-    service_account_key:
+    service_account_key: Optional[dict] = Field(
+        default=None, description="Credentials values to use for authentication"
+    )
+    service_account_key_path: Optional[Path] = Field(
+        default=None, description="File path to credentials values to use for authentication"
+    )
+
+    def model_post_init(self, __context: Any) -> None:
+        if self.service_account_key is None and self.service_account_key_path is None:
+            raise ValueError(
+                "either service_account_key or service_account_key_path must be provided"
+            )
+
+    def get_service_account_key(self) -> dict:
+        key_data = None
+        if self.service_account_key_path:
+            with self.service_account_key_path.open() as f:
+                key_data = json.load(f)
+        if key_data and self.service_account_key:
+            if key_data == self.service_account_key:
+                return key_data
+            else:
+                raise ValueError(
+                    "service_account_key and service_account_key_path "
+                    "both provided and have different values"
+                )
+        if key_data:
+            return key_data
+        return self.service_account_key


-@dataclass
 class GoogleDriveConnectionConfig(ConnectionConfig):
-    drive_id: str
-    access_config: GoogleDriveAccessConfig
+    drive_id: str = Field(description="Google Drive File or Folder ID.")
+    access_config: Secret[GoogleDriveAccessConfig]

     @requires_dependencies(["googleapiclient"], extras="google-drive")
     def get_files_service(self) -> "GoogleAPIResource":
-        from google.auth import
+        from google.auth import exceptions
         from google.oauth2 import service_account
         from googleapiclient.discovery import build
         from googleapiclient.errors import HttpError

-
-
-        if isinstance(self.access_config.service_account_key, str):
-            key_path = json_to_dict(self.access_config.service_account_key)
-        elif isinstance(self.access_config.service_account_key, dict):
-            key_path = self.access_config.service_account_key
-        else:
-            raise TypeError(
-                f"access_config.service_account_key must be "
-                f"str or dict, got: {type(self.access_config.service_account_key)}"
-            )
+        access_config = self.access_config.get_secret_value()
+        key_data = access_config.get_service_account_key()

         try:
-
-                creds = service_account.Credentials.from_service_account_info(key_path)
-            elif isinstance(key_path, str):
-                os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path
-                creds, _ = default()
-            else:
-                raise ValueError(
-                    f"key path not recognized as a dictionary or a file path: "
-                    f"[{type(key_path)}] {key_path}",
-                )
+            creds = service_account.Credentials.from_service_account_info(key_data)
             service = build("drive", "v3", credentials=creds)
             return service.files()

@@ -86,7 +94,6 @@ class GoogleDriveConnectionConfig(ConnectionConfig):
             raise ValueError("The provided API key is invalid.")


-@dataclass
 class GoogleDriveIndexerConfig(IndexerConfig):
     extensions: Optional[list[str]] = None
     recursive: bool = False
@@ -268,7 +275,6 @@ class GoogleDriveIndexer(Indexer):
             yield f


-@dataclass
 class GoogleDriveDownloaderConfig(DownloaderConfig):
     pass

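Net effect of the google_drive.py changes: the enhanced-dataclass configs become pydantic models, the access config gains a key-file alternative with cross-validation in model_post_init, and the credentials move behind pydantic's Secret wrapper. A minimal sketch of the new surface, assuming pydantic v2's Secret generic (which accepts and wraps the plain model); the file path and drive ID are placeholders:

    from pathlib import Path

    from unstructured_ingest.v2.processes.connectors.google_drive import (
        GoogleDriveAccessConfig,
        GoogleDriveConnectionConfig,
    )

    # Supplying neither key source trips the model_post_init check
    try:
        GoogleDriveAccessConfig()
    except ValueError as e:
        print(e)  # either service_account_key or service_account_key_path must be provided

    # With a key file on disk, get_service_account_key() loads and returns the parsed dict
    access = GoogleDriveAccessConfig(service_account_key_path=Path("service_account.json"))
    conn = GoogleDriveConnectionConfig(drive_id="<drive-or-folder-id>", access_config=access)
    key_data = conn.access_config.get_secret_value().get_service_account_key()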
unstructured_ingest/v2/processes/connectors/kdbai.py (new file)
@@ -0,0 +1,170 @@
+import json
+import uuid
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Optional
+
+import numpy as np
+import pandas as pd
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import DestinationConnectionError
+from unstructured_ingest.utils.data_prep import flatten_dict
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    FileData,
+    UploadContent,
+    Uploader,
+    UploaderConfig,
+    UploadStager,
+    UploadStagerConfig,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    DestinationRegistryEntry,
+)
+
+if TYPE_CHECKING:
+    from kdbai_client import Session, Table
+
+CONNECTOR_TYPE = "kdbai"
+
+
+class KdbaiAccessConfig(AccessConfig):
+    api_key: Optional[str] = Field(
+        default=None,
+        description="A string for the api-key, can be left empty "
+        "when connecting to local KDBAI instance.",
+    )
+
+
+SecretKdbaiAccessConfig = Secret[KdbaiAccessConfig]
+
+
+class KdbaiConnectionConfig(ConnectionConfig):
+    access_config: SecretKdbaiAccessConfig = Field(
+        default=SecretKdbaiAccessConfig(secret_value=KdbaiAccessConfig())
+    )
+    endpoint: str = Field(
+        default="http://localhost:8082", description="Endpoint url where KDBAI is hosted."
+    )
+
+    @requires_dependencies(["kdbai_client"], extras="kdbai")
+    def get_session(self) -> "Session":
+        from kdbai_client import Session
+
+        return Session(
+            api_key=self.access_config.get_secret_value().api_key, endpoint=self.endpoint
+        )
+
+
+class KdbaiUploadStagerConfig(UploadStagerConfig):
+    pass
+
+
+@dataclass
+class KdbaiUploadStager(UploadStager):
+    upload_stager_config: KdbaiUploadStagerConfig = field(default_factory=KdbaiUploadStagerConfig)
+
+    def run(
+        self,
+        elements_filepath: Path,
+        file_data: FileData,
+        output_dir: Path,
+        output_filename: str,
+        **kwargs: Any,
+    ) -> Path:
+        with open(elements_filepath) as elements_file:
+            elements_contents = json.load(elements_file)
+        output_path = Path(output_dir) / Path(f"{output_filename}.json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        data = []
+        for element in elements_contents:
+            data.append(
+                {
+                    "id": str(uuid.uuid4()),
+                    "element_id": element.get("element_id"),
+                    "document": element.pop("text", None),
+                    "embeddings": element.get("embeddings"),
+                    "metadata": flatten_dict(
+                        dictionary=element.get("metadata"),
+                        flatten_lists=True,
+                        remove_none=True,
+                    ),
+                }
+            )
+        logger.debug(f"writing {len(data)} elements to {output_path}")
+        with output_path.open("w") as output_file:
+            json.dump(data, output_file, indent=2)
+        return output_path
+
+
+class KdbaiUploaderConfig(UploaderConfig):
+    table_name: str = Field(description="The name of the KDBAI table to write into.")
+    batch_size: int = Field(default=100, description="Number of records per batch")
+
+
+@dataclass
+class KdbaiUploader(Uploader):
+    connection_config: KdbaiConnectionConfig
+    upload_config: KdbaiUploaderConfig
+    connector_type: str = field(default=CONNECTOR_TYPE, init=False)
+
+    def precheck(self) -> None:
+        try:
+            self.get_table()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
+    def get_table(self) -> "Table":
+        session: Session = self.connection_config.get_session()
+        table = session.table(self.upload_config.table_name)
+        return table
+
+    def upsert_batch(self, batch: pd.DataFrame):
+        table = self.get_table()
+        table.insert(data=batch)
+
+    def process_dataframe(self, df: pd.DataFrame):
+        logger.debug(
+            f"uploading {len(df)} entries to {self.connection_config.endpoint} "
+            f"db in table {self.upload_config.table_name}"
+        )
+        for _, batch_df in df.groupby(np.arange(len(df)) // self.upload_config.batch_size):
+            self.upsert_batch(batch=batch_df)
+
+    def process_csv(self, csv_paths: list[Path]):
+        logger.debug(f"uploading content from {len(csv_paths)} csv files")
+        df = pd.concat((pd.read_csv(path) for path in csv_paths), ignore_index=True)
+        self.process_dataframe(df=df)
+
+    def process_json(self, json_paths: list[Path]):
+        logger.debug(f"uploading content from {len(json_paths)} json files")
+        all_records = []
+        for p in json_paths:
+            with open(p) as json_file:
+                all_records.extend(json.load(json_file))
+
+        df = pd.DataFrame(data=all_records)
+        self.process_dataframe(df=df)
+
+    def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
+        csv_paths = [c.path for c in contents if c.path.suffix == ".csv"]
+        if csv_paths:
+            self.process_csv(csv_paths=csv_paths)
+        json_paths = [c.path for c in contents if c.path.suffix == ".json"]
+        if json_paths:
+            self.process_json(json_paths=json_paths)
+
+
+kdbai_destination_entry = DestinationRegistryEntry(
+    connection_config=KdbaiConnectionConfig,
+    uploader=KdbaiUploader,
+    uploader_config=KdbaiUploaderConfig,
+    upload_stager=KdbaiUploadStager,
+    upload_stager_config=KdbaiUploadStagerConfig,
+)
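The KDB.AI destination is entirely new in 0.0.5. A short sketch of driving it directly, assuming a reachable instance and an existing table (the endpoint and table name are placeholders):

    from unstructured_ingest.v2.processes.connectors.kdbai import (
        KdbaiConnectionConfig,
        KdbaiUploader,
        KdbaiUploaderConfig,
    )

    uploader = KdbaiUploader(
        connection_config=KdbaiConnectionConfig(endpoint="http://localhost:8082"),
        upload_config=KdbaiUploaderConfig(table_name="elements", batch_size=100),
    )
    # Raises DestinationConnectionError if the session or table cannot be reached
    uploader.precheck()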
unstructured_ingest/v2/processes/connectors/local.py
@@ -5,6 +5,8 @@ from pathlib import Path
 from time import time
 from typing import Any, Generator

+from pydantic import Field, Secret
+
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -29,20 +31,28 @@ from unstructured_ingest.v2.processes.connector_registry import (
 CONNECTOR_TYPE = "local"


-@dataclass
 class LocalAccessConfig(AccessConfig):
     pass


-
+SecretLocalAccessConfig = Secret[LocalAccessConfig]
+
+
 class LocalConnectionConfig(ConnectionConfig):
-    access_config:
+    access_config: SecretLocalAccessConfig = Field(
+        default_factory=lambda: SecretLocalAccessConfig(secret_value=LocalAccessConfig())
+    )


-@dataclass
 class LocalIndexerConfig(IndexerConfig):
-    input_path:
-
+    input_path: Path = Field(
+        description="Path to the location in the local file system that will be processed."
+    )
+    recursive: bool = Field(
+        default=False,
+        description="Recursively download files in their respective folders "
+        "otherwise stop at the files in provided folder level.",
+    )

     @property
     def path(self) -> Path:
@@ -61,9 +71,12 @@ class LocalIndexer(Indexer):
         input_path = self.index_config.path
         if input_path.is_file():
             return [Path(s) for s in glob.glob(f"{self.index_config.path}")]
+        files = []
         if self.index_config.recursive:
-
-
+            files.extend(list(input_path.rglob("*")))
+        else:
+            files.extend(list(input_path.glob("*")))
+        return [f for f in files if f.is_file()]

     def get_file_metadata(self, path: Path) -> FileDataSourceMetadata:
         stats = path.stat()
@@ -122,7 +135,6 @@ class LocalIndexer(Indexer):
             yield file_data


-@dataclass
 class LocalDownloaderConfig(DownloaderConfig):
     pass

@@ -130,10 +142,8 @@ class LocalDownloaderConfig(DownloaderConfig):
 @dataclass
 class LocalDownloader(Downloader):
     connector_type: str = CONNECTOR_TYPE
-    connection_config: LocalConnectionConfig = field(
-
-    )
-    download_config: LocalDownloaderConfig = field(default_factory=lambda: LocalDownloaderConfig())
+    connection_config: LocalConnectionConfig = field(default_factory=LocalConnectionConfig)
+    download_config: LocalDownloaderConfig = field(default_factory=LocalDownloaderConfig)

     def get_download_path(self, file_data: FileData) -> Path:
         return Path(file_data.source_identifiers.fullpath)
@@ -144,9 +154,10 @@ class LocalDownloader(Downloader):
         )


-@dataclass
 class LocalUploaderConfig(UploaderConfig):
-    output_dir: str =
+    output_dir: str = Field(
+        default="structured-output", description="Local path to write partitioned output to"
+    )

     @property
     def output_path(self) -> Path:
@@ -160,7 +171,7 @@ class LocalUploaderConfig(UploaderConfig):
 @dataclass
 class LocalUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
-    upload_config: LocalUploaderConfig = field(default_factory=
+    upload_config: LocalUploaderConfig = field(default_factory=LocalUploaderConfig)
     connection_config: LocalConnectionConfig = field(
         default_factory=lambda: LocalConnectionConfig()
     )
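The previously truncated indexer branch now resolves to a glob/rglob split. Restated standalone for clarity (the helper name is ours; the logic mirrors LocalIndexer above):

    from pathlib import Path

    def list_files(input_path: Path, recursive: bool) -> list[Path]:
        # rglob("*") walks subdirectories; glob("*") stops at the top level;
        # directories are filtered out either way
        files = list(input_path.rglob("*")) if recursive else list(input_path.glob("*"))
        return [f for f in files if f.is_file()]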
unstructured_ingest/v2/processes/connectors/milvus.py
@@ -6,8 +6,8 @@ from typing import TYPE_CHECKING, Any, Optional, Union

 import pandas as pd
 from dateutil import parser
+from pydantic import Field, Secret

-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import WriteError
 from unstructured_ingest.utils.data_prep import flatten_dict
 from unstructured_ingest.utils.dep_check import requires_dependencies
@@ -32,24 +32,28 @@ if TYPE_CHECKING:
 CONNECTOR_TYPE = "milvus"


-@dataclass
 class MilvusAccessConfig(AccessConfig):
-    password: Optional[str] = None
-    token: Optional[str] = None
+    password: Optional[str] = Field(default=None, description="Milvus password")
+    token: Optional[str] = Field(default=None, description="Milvus access token")
+
+
+SecretMilvusAccessConfig = Secret[MilvusAccessConfig]


-@dataclass
 class MilvusConnectionConfig(ConnectionConfig):
-    access_config:
-
+    access_config: SecretMilvusAccessConfig = Field(
+        default_factory=lambda: SecretMilvusAccessConfig(secret_value=MilvusAccessConfig())
     )
-    uri: Optional[str] =
-
-
+    uri: Optional[str] = Field(
+        default=None, description="Milvus uri", examples=["http://localhost:19530"]
+    )
+    user: Optional[str] = Field(default=None, description="Milvus user")
+    db_name: Optional[str] = Field(default=None, description="Milvus database name")

     def get_connection_kwargs(self) -> dict[str, Any]:
-
-
+        access_config = self.access_config.get_secret_value()
+        access_config_dict = access_config.dict()
+        connection_config_dict = self.dict()
         connection_config_dict.pop("access_config", None)
         connection_config_dict.update(access_config_dict)
         # Drop any that were not set explicitly
@@ -63,7 +67,6 @@ class MilvusConnectionConfig(ConnectionConfig):
         return MilvusClient(**self.get_connection_kwargs())


-@dataclass
 class MilvusUploadStagerConfig(UploadStagerConfig):
     pass

@@ -130,10 +133,11 @@ class MilvusUploadStager(UploadStager):
         return output_path


-@dataclass
 class MilvusUploaderConfig(UploaderConfig):
-    collection_name: str
-
+    collection_name: str = Field(description="Milvus collections to write to")
+    num_processes: int = Field(
+        default=4, description="number of processes to use when writing to support parallel writes"
+    )


 @dataclass
@@ -180,13 +184,13 @@ class MilvusUploader(Uploader):
         self.insert_results(data=data)

     def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
-        if self.upload_config.
+        if self.upload_config.num_processes == 1:
             for content in contents:
                 self.upload(content=content)

         else:
             with mp.Pool(
-                processes=self.upload_config.
+                processes=self.upload_config.num_processes,
             ) as pool:
                 pool.map(self.upload, contents)

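get_connection_kwargs() now unwraps the secret access config and merges it over the connection fields, dropping the access_config key itself and any unset values. A sketch with illustrative values:

    from unstructured_ingest.v2.processes.connectors.milvus import (
        MilvusAccessConfig,
        MilvusConnectionConfig,
        SecretMilvusAccessConfig,
    )

    conn = MilvusConnectionConfig(
        uri="http://localhost:19530",
        access_config=SecretMilvusAccessConfig(secret_value=MilvusAccessConfig(token="t0ken")),
    )
    # Expected shape: {"uri": "http://localhost:19530", "token": "t0ken"}
    kwargs = conn.get_connection_kwargs()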
unstructured_ingest/v2/processes/connectors/mongodb.py
@@ -3,9 +3,9 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional

-from
+from pydantic import Field, Secret

-from unstructured_ingest.
+from unstructured_ingest.__version__ import __version__ as unstructured_version
 from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
@@ -31,25 +31,28 @@ CONNECTOR_TYPE = "mongodb"
 SERVER_API_VERSION = "1"


-@dataclass
 class MongoDBAccessConfig(AccessConfig):
-    uri: Optional[str] = None
+    uri: Optional[str] = Field(default=None, description="URI to user when connecting")
+
+
+SecretMongoDBAccessConfig = Secret[MongoDBAccessConfig]


-@dataclass
 class MongoDBConnectionConfig(ConnectionConfig):
-    access_config:
-
+    access_config: SecretMongoDBAccessConfig = Field(
+        default_factory=lambda: SecretMongoDBAccessConfig(secret_value=MongoDBAccessConfig())
     )
-    host: Optional[str] =
-
-
-
-
-
+    host: Optional[str] = Field(
+        default=None,
+        description="hostname or IP address or Unix domain socket path of a single mongod or "
+        "mongos instance to connect to, or a list of hostnames",
+    )
+    database: Optional[str] = Field(default=None, description="database name to connect to")
+    collection: Optional[str] = Field(default=None, description="collection name to connect to")
+    port: int = Field(default=27017)
+    connector_type: str = Field(default=CONNECTOR_TYPE, init=False)


-@dataclass
 class MongoDBUploadStagerConfig(UploadStagerConfig):
     pass

@@ -77,9 +80,8 @@ class MongoDBUploadStager(UploadStager):
         return output_path


-@dataclass
 class MongoDBUploaderConfig(UploaderConfig):
-    batch_size: int = 100
+    batch_size: int = Field(default=100, description="Number of records per batch")


 @dataclass
@@ -102,9 +104,11 @@ class MongoDBUploader(Uploader):
         from pymongo.driver_info import DriverInfo
         from pymongo.server_api import ServerApi

-
+        access_config = self.connection_config.access_config.get_secret_value()
+
+        if access_config.uri:
             return MongoClient(
-
+                access_config.uri,
                 server_api=ServerApi(version=SERVER_API_VERSION),
                 driver=DriverInfo(name="unstructured", version=unstructured_version),
             )
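A sketch of the new URI-based client path: the secret config is unwrapped once, and its uri, when set, is handed to MongoClient along with the pinned server API version and driver info. The connection string and names below are illustrative:

    from unstructured_ingest.v2.processes.connectors.mongodb import (
        MongoDBAccessConfig,
        MongoDBConnectionConfig,
        SecretMongoDBAccessConfig,
    )

    conn = MongoDBConnectionConfig(
        access_config=SecretMongoDBAccessConfig(
            secret_value=MongoDBAccessConfig(uri="mongodb+srv://user:pass@cluster.example.net")
        ),
        database="ingest",
        collection="elements",
    )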
unstructured_ingest/v2/processes/connectors/onedrive.py
@@ -1,12 +1,12 @@
 import json
-from dataclasses import dataclass
+from dataclasses import dataclass
 from pathlib import Path
 from time import time
 from typing import TYPE_CHECKING, Any, Generator, Optional

 from dateutil import parser
+from pydantic import Field, Secret

-from unstructured_ingest.enhanced_dataclass import enhanced_field
 from unstructured_ingest.error import SourceConnectionError, SourceConnectionNetworkError
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -35,18 +35,23 @@ CONNECTOR_TYPE = "onedrive"
 MAX_MB_SIZE = 512_000_000


-@dataclass
 class OnedriveAccessConfig(AccessConfig):
-    client_cred: str
+    client_cred: str = Field(description="Microsoft App client secret")


-@dataclass
 class OnedriveConnectionConfig(ConnectionConfig):
-    client_id: str
-    user_pname: str
-    tenant: str =
-
-
+    client_id: str = Field(description="Microsoft app client ID")
+    user_pname: str = Field(description="User principal name, usually is your Azure AD email.")
+    tenant: str = Field(
+        repr=False, description="ID or domain name associated with your Azure AD instance"
+    )
+    authority_url: Optional[str] = Field(
+        repr=False,
+        default="https://login.microsoftonline.com",
+        examples=["https://login.microsoftonline.com"],
+        description="Authentication token provider for Microsoft apps",
+    )
+    access_config: Secret[OnedriveAccessConfig]

     @requires_dependencies(["msal"], extras="onedrive")
     def get_token(self):
@@ -56,7 +61,7 @@ class OnedriveConnectionConfig(ConnectionConfig):
             app = ConfidentialClientApplication(
                 authority=f"{self.authority_url}/{self.tenant}",
                 client_id=self.client_id,
-                client_credential=self.access_config.client_cred,
+                client_credential=self.access_config.get_secret_value().client_cred,
             )
             token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
         except ValueError as exc:
@@ -76,9 +81,8 @@ class OnedriveConnectionConfig(ConnectionConfig):
         return client


-@dataclass
 class OnedriveIndexerConfig(IndexerConfig):
-    path: Optional[str] =
+    path: Optional[str] = Field(default="")
     recursive: bool = False


@@ -171,7 +175,6 @@ class OnedriveIndexer(Indexer):
             yield file_data


-@dataclass
 class OnedriveDownloaderConfig(DownloaderConfig):
     pass

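With the onedrive.py changes, the client secret sits behind pydantic's Secret wrapper and is unwrapped explicitly before it reaches msal. A sketch with placeholder identifiers:

    from unstructured_ingest.v2.processes.connectors.onedrive import (
        OnedriveAccessConfig,
        OnedriveConnectionConfig,
    )

    conn = OnedriveConnectionConfig(
        client_id="<app-client-id>",
        user_pname="user@example.com",
        tenant="<tenant-id>",
        access_config=OnedriveAccessConfig(client_cred="<client-secret>"),
    )
    # repr(conn) omits tenant and authority_url (repr=False) and masks access_config (Secret).
    # get_token() acquires an app-only Graph token via msal; it needs a valid app
    # registration and network access.
    token = conn.get_token()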