unstructured-ingest 0.0.14__py3-none-any.whl → 0.0.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +1 -1
- unstructured_ingest/cli/utils.py +1 -1
- unstructured_ingest/connector/astradb.py +1 -1
- unstructured_ingest/connector/biomed.py +4 -4
- unstructured_ingest/connector/chroma.py +1 -1
- unstructured_ingest/connector/databricks_volumes.py +2 -2
- unstructured_ingest/connector/fsspec/box.py +1 -1
- unstructured_ingest/connector/fsspec/fsspec.py +5 -5
- unstructured_ingest/connector/git.py +1 -1
- unstructured_ingest/connector/google_drive.py +4 -4
- unstructured_ingest/connector/hubspot.py +1 -1
- unstructured_ingest/connector/kafka.py +8 -8
- unstructured_ingest/connector/local.py +1 -1
- unstructured_ingest/connector/notion/helpers.py +4 -4
- unstructured_ingest/connector/onedrive.py +3 -3
- unstructured_ingest/connector/outlook.py +2 -2
- unstructured_ingest/connector/pinecone.py +1 -1
- unstructured_ingest/connector/sharepoint.py +8 -8
- unstructured_ingest/connector/vectara.py +6 -6
- unstructured_ingest/interfaces.py +4 -4
- unstructured_ingest/logger.py +1 -1
- unstructured_ingest/pipeline/copy.py +1 -1
- unstructured_ingest/pipeline/interfaces.py +2 -2
- unstructured_ingest/pipeline/partition.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/pipeline/reformat/chunking.py +2 -2
- unstructured_ingest/pipeline/reformat/embedding.py +1 -1
- unstructured_ingest/pipeline/source.py +2 -2
- unstructured_ingest/utils/compression.py +3 -3
- unstructured_ingest/utils/string_and_date_utils.py +2 -2
- unstructured_ingest/v2/cli/base/cmd.py +3 -3
- unstructured_ingest/v2/cli/base/dest.py +1 -1
- unstructured_ingest/v2/cli/base/src.py +1 -1
- unstructured_ingest/v2/cli/utils/click.py +1 -1
- unstructured_ingest/v2/interfaces/processor.py +48 -13
- unstructured_ingest/v2/logger.py +1 -1
- unstructured_ingest/v2/otel.py +1 -1
- unstructured_ingest/v2/pipeline/interfaces.py +9 -2
- unstructured_ingest/v2/pipeline/pipeline.py +17 -6
- unstructured_ingest/v2/pipeline/steps/chunk.py +3 -3
- unstructured_ingest/v2/pipeline/steps/download.py +17 -2
- unstructured_ingest/v2/pipeline/steps/embed.py +3 -3
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
- unstructured_ingest/v2/pipeline/steps/index.py +2 -2
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -3
- unstructured_ingest/v2/pipeline/steps/stage.py +1 -1
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -4
- unstructured_ingest/v2/processes/connectors/google_drive.py +1 -1
- unstructured_ingest/v2/processes/connectors/local.py +6 -5
- unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
- unstructured_ingest/v2/processes/connectors/onedrive.py +2 -2
- unstructured_ingest/v2/processes/connectors/opensearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +2 -2
- unstructured_ingest/v2/processes/connectors/sharepoint.py +9 -5
- unstructured_ingest/v2/processes/filter.py +1 -1
- unstructured_ingest/v2/processes/partitioner.py +3 -3
- unstructured_ingest/v2/utils.py +7 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/METADATA +272 -274
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/RECORD +69 -69
- unstructured_ingest/evaluate.py +0 -338
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.14.dist-info → unstructured_ingest-0.0.16.dist-info}/top_level.txt +0 -0

unstructured_ingest/v2/processes/connectors/airtable.py
ADDED
@@ -0,0 +1,235 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
+from uuid import NAMESPACE_DNS, uuid5
+
+import pandas
+from pydantic import BaseModel, Field, Secret, field_validator
+
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    FileData,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+    download_responses,
+)
+from unstructured_ingest.v2.processes.connector_registry import (
+    SourceRegistryEntry,
+)
+
+if TYPE_CHECKING:
+    from pyairtable import Api
+    from pyairtable.api.types import RecordDict
+
+CONNECTOR_TYPE = "airtable"
+
+
+class AirtableTableMeta(BaseModel):
+    """Metadata specifying a table id, a base id which the table is stored in,
+    and an t.Optional view id in case particular rows and fields are to be ingested"""
+
+    base_id: str
+    table_id: str
+    view_id: Optional[str] = None
+
+    def get_id(self) -> str:
+        id_s = f"{self.base_id}{self.table_id}"
+        id_s = f"{id_s}{self.view_id}" if self.view_id else id_s
+        return str(uuid5(NAMESPACE_DNS, id_s))
+
+
+class AirtableAccessConfig(AccessConfig):
+    personal_access_token: str = Field(
+        description="Personal access token to authenticate into Airtable. Check: "
+        "https://support.airtable.com/docs/creating-and-using-api-keys-and-access-tokens "
+        "for more info"
+    )
+
+
+class AirtableConnectionConfig(ConnectionConfig):
+    access_config: Secret[AirtableAccessConfig]
+
+    @requires_dependencies(["pyairtable"], extras="airtable")
+    def get_client(self) -> "Api":
+        from pyairtable import Api
+
+        access_config = self.access_config.get_secret_value()
+        return Api(api_key=access_config.personal_access_token)
+
+
+class AirtableIndexerConfig(IndexerConfig):
+    list_of_paths: Optional[list[str]] = Field(
+        default=None,
+        description="""
+        A list of paths that specify the locations to ingest data from within Airtable.
+
+        If this argument is not set, the connector ingests all tables within each and every base.
+        --list-of-paths: path1 path2 path3 ….
+        path: base_id/table_id(optional)/view_id(optional)/
+
+        To obtain (base, table, view) ids in bulk, check:
+        https://airtable.com/developers/web/api/list-bases (base ids)
+        https://airtable.com/developers/web/api/get-base-schema (table and view ids)
+        https://pyairtable.readthedocs.io/en/latest/metadata.html (base, table and view ids)
+
+        To obtain specific ids from Airtable UI, go to your workspace, and copy any
+        relevant id from the URL structure:
+        https://airtable.com/appAbcDeF1ghijKlm/tblABcdEfG1HIJkLm/viwABCDEfg6hijKLM
+        appAbcDeF1ghijKlm -> base_id
+        tblABcdEfG1HIJkLm -> table_id
+        viwABCDEfg6hijKLM -> view_id
+
+        You can also check: https://support.airtable.com/docs/finding-airtable-ids
+
+        Here is an example for one --list-of-paths:
+            base1/ → gets the entirety of all tables inside base1
+            base1/table1 → gets all rows and columns within table1 in base1
+            base1/table1/view1 → gets the rows and columns that are
+                visible in view1 for the table1 in base1
+
+        Examples to invalid airtable_paths:
+            table1 → has to mention base to be valid
+            base1/view1 → has to mention table to be valid
+        """,
+    )
+
+    @classmethod
+    def validate_path(cls, path: str):
+        components = path.split("/")
+        if len(components) > 3:
+            raise ValueError(
+                f"Path must be of the format: base_id/table_id/view_id, "
+                f"where table id and view id are optional. Got: {path}"
+            )
+
+    @field_validator("list_of_paths")
+    @classmethod
+    def validate_format(cls, v: list[str]) -> list[str]:
+        for path in v:
+            cls.validate_path(path=path)
+        return v
+
+
+@dataclass
+class AirtableIndexer(Indexer):
+    connector_type: str = CONNECTOR_TYPE
+    connection_config: AirtableConnectionConfig
+    index_config: AirtableIndexerConfig
+
+    def get_all_table_meta(self) -> list[AirtableTableMeta]:
+        client = self.connection_config.get_client()
+        bases = client.bases()
+        airtable_meta = []
+        for base in bases:
+            for table in base.schema().tables:
+                airtable_meta.append(AirtableTableMeta(base_id=base.id, table_id=table.id))
+        return airtable_meta
+
+    def get_base_tables_meta(self, base_id: str) -> list[AirtableTableMeta]:
+        client = self.connection_config.get_client()
+        base = client.base(base_id=base_id)
+        airtable_meta = []
+        for table in base.tables():
+            airtable_meta.append(AirtableTableMeta(base_id=base.id, table_id=table.id))
+        return airtable_meta
+
+    def get_meta_from_list(self) -> list[AirtableTableMeta]:
+        airtable_meta = []
+        for path in self.index_config.list_of_paths:
+            components = path.split("/")
+            if len(components) == 1:
+                airtable_meta.extend(self.get_base_tables_meta(base_id=components[0]))
+            elif len(components) == 2:
+                airtable_meta.append(
+                    AirtableTableMeta(base_id=components[0], table_id=components[1])
+                )
+            elif len(components) == 3:
+                airtable_meta.append(
+                    AirtableTableMeta(
+                        base_id=components[0], table_id=components[1], view_id=components[2]
+                    )
+                )
+            else:
+                raise ValueError(
+                    f"Path must be of the format: base_id/table_id/view_id, "
+                    f"where table id and view id are optional. Got: {path}"
+                )
+        return airtable_meta
+
+    def get_table_metas(self) -> list[AirtableTableMeta]:
+        if not self.index_config.list_of_paths:
+            return self.get_all_table_meta()
+        return self.get_meta_from_list()
+
+    def precheck(self) -> None:
+        client = self.connection_config.get_client()
+        client.request(method="HEAD", url=client.build_url("meta", "bases"))
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        table_metas = self.get_table_metas()
+        for table_meta in table_metas:
+            fullpath = (
+                f"{table_meta.base_id}/{table_meta.table_id}/{table_meta.view_id}.csv"
+                if table_meta.view_id
+                else f"{table_meta.base_id}/{table_meta.table_id}.csv"
+            )
+            yield FileData(
+                identifier=table_meta.get_id(),
+                connector_type=CONNECTOR_TYPE,
+                additional_metadata=table_meta.dict(),
+                source_identifiers=SourceIdentifiers(
+                    filename=str(Path(fullpath).name),
+                    fullpath=fullpath,
+                ),
+            )
+
+
+class AirtableDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class AirtableDownloader(Downloader):
+    connection_config: AirtableConnectionConfig
+    download_config: AirtableDownloaderConfig = field(default_factory=AirtableDownloaderConfig)
+    connector_type: str = CONNECTOR_TYPE
+
+    def get_table_contents(self, table_meta: AirtableTableMeta) -> list["RecordDict"]:
+        client = self.connection_config.get_client()
+        table = client.table(base_id=table_meta.base_id, table_name=table_meta.table_id)
+        table_fetch_kwargs = {"view": table_meta.view_id} if table_meta.view_id else {}
+        rows = table.all(**table_fetch_kwargs)
+        return rows
+
+    def _table_row_to_dict(self, table_row: "RecordDict") -> dict:
+        row_dict = {
+            "id": table_row["id"],
+            "created_time": table_row["createdTime"],
+        }
+        row_dict.update(table_row["fields"])
+        return row_dict
+
+    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        table_meta = AirtableTableMeta.model_validate(file_data.additional_metadata)
+        table_contents = self.get_table_contents(table_meta=table_meta)
+        df = pandas.DataFrame.from_dict(
+            data=[self._table_row_to_dict(table_row=row) for row in table_contents]
+        ).sort_index(axis=1)
+        download_path = self.get_download_path(file_data=file_data)
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        df.to_csv(path_or_buf=download_path)
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+airtable_source_entry = SourceRegistryEntry(
+    indexer=AirtableIndexer,
+    indexer_config=AirtableIndexerConfig,
+    downloader=AirtableDownloader,
+    downloader_config=AirtableDownloaderConfig,
+    connection_config=AirtableConnectionConfig,
+)
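Note: the indexer above derives a deterministic FileData identifier from the base, table, and view ids via uuid5. A minimal standalone sketch of that scheme (mirroring AirtableTableMeta.get_id(); the helper name and example ids are illustrative only):

from typing import Optional
from uuid import NAMESPACE_DNS, uuid5


def table_meta_id(base_id: str, table_id: str, view_id: Optional[str] = None) -> str:
    # Mirrors AirtableTableMeta.get_id(): concatenate the ids and hash with uuid5
    id_s = f"{base_id}{table_id}"
    if view_id:
        id_s = f"{id_s}{view_id}"
    return str(uuid5(NAMESPACE_DNS, id_s))


# Example ids taken from the docstring above (illustrative values)
base_id, table_id, view_id = "appAbcDeF1ghijKlm/tblABcdEfG1HIJkLm/viwABCDEfg6hijKLM".split("/")
print(table_meta_id(base_id, table_id, view_id))  # same output on every run

Because uuid5 is deterministic, re-indexing the same base/table/view always produces the same identifier.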

unstructured_ingest/v2/processes/connectors/elasticsearch.py
CHANGED
@@ -104,7 +104,7 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
         elif access_config.es_api_key:
             client_input_kwargs["api_key"] = access_config.es_api_key
         client_input = ElasticsearchClientInput(**client_input_kwargs)
-        logger.debug(f"
+        logger.debug(f"elasticsearch client inputs mapped to: {client_input.dict()}")
         client_kwargs = client_input.dict()
         client_kwargs["basic_auth"] = (
             client_input.basic_auth.get_secret_value() if client_input.basic_auth else None

unstructured_ingest/v2/processes/connectors/fsspec/box.py
CHANGED
@@ -47,7 +47,7 @@ class BoxConnectionConfig(FsspecConnectionConfig):
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

     def get_access_config(self) -> dict[str, Any]:
-        # Return access_kwargs with oauth. The oauth object
+        # Return access_kwargs with oauth. The oauth object cannot be stored directly in the config
         # because it is not serializable.
         from boxsdk import JWTAuth


unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
CHANGED
@@ -317,9 +317,9 @@ class FsspecUploader(Uploader):
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
         if self.fs.exists(path=str(upload_path)) and not self.upload_config.overwrite:
-            logger.debug(f"
+            logger.debug(f"skipping upload of {path} to {upload_path}, file already exists")
             return
-        logger.debug(f"
+        logger.debug(f"writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))

     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
@@ -328,7 +328,7 @@ class FsspecUploader(Uploader):
         # Odd that fsspec doesn't run exists() as async even when client support async
         already_exists = self.fs.exists(path=str(upload_path))
         if already_exists and not self.upload_config.overwrite:
-            logger.debug(f"
+            logger.debug(f"skipping upload of {path} to {upload_path}, file already exists")
             return
-        logger.debug(f"
+        logger.debug(f"writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))

unstructured_ingest/v2/processes/connectors/google_drive.py
CHANGED
@@ -199,7 +199,7 @@ class GoogleDriveIndexer(Indexer):
         if extensions:
             ext_filter = " or ".join([f"fileExtension = '{e}'" for e in extensions])
             q = f"{q} and ({ext_filter} or mimeType = 'application/vnd.google-apps.folder')"
-        logger.debug(f"
+        logger.debug(f"query used when indexing: {q}")
         logger.debug("response fields limited to: {}".format(", ".join(self.fields)))
         done = False
         page_token = None

unstructured_ingest/v2/processes/connectors/local.py
CHANGED
@@ -180,14 +180,15 @@ class LocalUploader(Uploader):

     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         if source_identifiers := file_data.source_identifiers:
-            identifiers = source_identifiers
             rel_path = (
-
-                if
-                else
+                source_identifiers.relative_path[1:]
+                if source_identifiers.relative_path.startswith("/")
+                else source_identifiers.relative_path
             )
             new_path = self.upload_config.output_path / Path(rel_path)
-            final_path = str(new_path).replace(
+            final_path = str(new_path).replace(
+                source_identifiers.filename, f"{source_identifiers.filename}.json"
+            )
         else:
             final_path = self.upload_config.output_path / Path(f"{file_data.identifier}.json")
         Path(final_path).parent.mkdir(parents=True, exist_ok=True)
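To make the new path handling concrete, a small standalone sketch (the values below stand in for upload_config.output_path and source_identifiers; only the leading-slash strip and the .json suffix mirror the hunk above):

from pathlib import Path

output_path = Path("/tmp/ingest-output")   # stands in for upload_config.output_path
relative_path = "/docs/report.pdf"         # stands in for source_identifiers.relative_path
filename = "report.pdf"                    # stands in for source_identifiers.filename

rel_path = relative_path[1:] if relative_path.startswith("/") else relative_path
new_path = output_path / Path(rel_path)
final_path = str(new_path).replace(filename, f"{filename}.json")
print(final_path)  # /tmp/ingest-output/docs/report.pdf.json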

unstructured_ingest/v2/processes/connectors/milvus.py
CHANGED
@@ -71,7 +71,7 @@ class MilvusUploadStagerConfig(UploadStagerConfig):
     fields_to_include: Optional[list[str]] = None
     """If set - list of fields to include in the output.
     Unspecified fields are removed from the elements.
-    This action
+    This action takes place after metadata flattening.
    Missing fields will cause stager to throw KeyError."""

     flatten_metadata: bool = True
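A rough sketch of the stager behaviour the docstring describes (the helper names below are illustrative, not the package's actual functions): metadata is flattened first, then only fields_to_include are kept, and a missing field raises KeyError.

def flatten_metadata(element: dict) -> dict:
    # Lift metadata keys to the top level and drop the nested "metadata" dict
    flat = {k: v for k, v in element.items() if k != "metadata"}
    flat.update(element.get("metadata", {}))
    return flat

def keep_fields(element: dict, fields_to_include: list) -> dict:
    # A field that is absent raises KeyError, as the docstring warns
    return {f: element[f] for f in fields_to_include}

element = {"text": "hello", "type": "NarrativeText", "metadata": {"filename": "a.pdf"}}
print(keep_fields(flatten_metadata(element), ["text", "filename"]))
# {'text': 'hello', 'filename': 'a.pdf'}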

unstructured_ingest/v2/processes/connectors/onedrive.py
CHANGED
@@ -213,9 +213,9 @@ class OnedriveDownloader(Downloader):
         fsize = file.get_property("size", 0)
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
-        logger.info(f"
+        logger.info(f"downloading {file_data.source_identifiers.fullpath} to {download_path}")
         if fsize > MAX_MB_SIZE:
-            logger.info(f"
+            logger.info(f"downloading file with size: {fsize} bytes in chunks")
             with download_path.open(mode="wb") as f:
                 file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query()
         else:

unstructured_ingest/v2/processes/connectors/opensearch.py
CHANGED
@@ -101,7 +101,7 @@ class OpenSearchConnectionConfig(ConnectionConfig):
         if self.username and access_config.password:
             client_input_kwargs["http_auth"] = (self.username, access_config.password)
         client_input = OpenSearchClientInput(**client_input_kwargs)
-        logger.debug(f"
+        logger.debug(f"opensearch client inputs mapped to: {client_input.dict()}")
         client_kwargs = client_input.dict()
         if client_input.http_auth is not None:
             client_kwargs["http_auth"] = client_input.http_auth.get_secret_value()

unstructured_ingest/v2/processes/connectors/pinecone.py
CHANGED
@@ -57,7 +57,7 @@ class PineconeConnectionConfig(ConnectionConfig):
         )

         index = pc.Index(name=self.index_name, **index_kwargs)
-        logger.debug(f"
+        logger.debug(f"connected to index: {pc.describe_index(self.index_name)}")
         return index


@@ -166,7 +166,7 @@ class PineconeUploader(Uploader):
                 max_batch_size=self.upload_config.batch_size,
             )
         )
-        logger.info(f"
+        logger.info(f"split doc with {len(elements_dict)} elements into {len(chunks)} batches")

         max_pool_threads = min(len(chunks), MAX_POOL_THREADS)
         if self.upload_config.pool_threads:
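The log line above reports how many batches the element list was split into before upserting. A generic illustration of that kind of bounded batching (the helper name is hypothetical, not the package's code):

def split_into_batches(items: list, max_batch_size: int) -> list:
    # Chunks of at most max_batch_size items, order preserved
    return [items[i : i + max_batch_size] for i in range(0, len(items), max_batch_size)]

chunks = split_into_batches(list(range(250)), max_batch_size=100)
print(len(chunks), [len(c) for c in chunks])  # 3 [100, 100, 50]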

unstructured_ingest/v2/processes/connectors/sharepoint.py
CHANGED
@@ -60,13 +60,16 @@ class SharepointAccessConfig(AccessConfig):


 class SharepointPermissionsConfig(BaseModel):
-    permissions_application_id: str = Field(
-
+    permissions_application_id: Optional[str] = Field(
+        default=None, description="Microsoft Graph API application id"
+    )
+    permissions_tenant: Optional[str] = Field(
+        default=None,
         description="url to get permissions data within tenant.",
         examples=["https://contoso.onmicrosoft.com"],
     )
-    permissions_client_cred: SecretStr = Field(
-        description="Microsoft Graph API application credentials"
+    permissions_client_cred: Optional[SecretStr] = Field(
+        default=None, description="Microsoft Graph API application credentials"
     )
     authority_url: Optional[SecretStr] = Field(
         repr=False,
@@ -335,7 +338,8 @@ class SharepointIndexer(Indexer):
     @property
     def process_permissions(self) -> bool:
         return (
-            self.connection_config.permissions_config
+            self.connection_config.permissions_config is not None
+            and self.connection_config.permissions_config.permissions_tenant
             and self.connection_config.permissions_config.permissions_client_cred.get_secret_value()
             and self.connection_config.permissions_config.permissions_application_id
         )

unstructured_ingest/v2/processes/filter.py
CHANGED
@@ -47,7 +47,7 @@ class Filterer(BaseProcess, ABC):
         for pattern in patterns:
             if fnmatch.filter([path], pattern):
                 return True
-        logger.debug(f"
+        logger.debug(f"the file {path!r} is discarded as it does not match any given glob.")
         return False

     def run(self, file_data: FileData, **kwargs: Any) -> Optional[FileData]:
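For context on the check above: fnmatch.filter returns a non-empty list when the path matches a pattern, and fnmatch's "*" also crosses "/" boundaries. A quick illustration with made-up paths and patterns:

import fnmatch

path = "docs/report.pdf"
patterns = ["*.pdf", "images/*"]
# True: "*.pdf" matches because fnmatch's "*" spans path separators
print(any(fnmatch.filter([path], pattern) for pattern in patterns))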

unstructured_ingest/v2/processes/partitioner.py
CHANGED
@@ -145,7 +145,7 @@ class Partitioner(BaseProcess, ABC):
 class FileDataSourceMetadata(DataSourceMetadata):
     filesize_bytes: Optional[int] = None

-        logger.debug(f"
+        logger.debug(f"using local partition with kwargs: {self.config.to_partition_kwargs()}")
         logger.debug(f"partitioning file {filename} with metadata {metadata}")
         elements = partition(
             filename=str(filename.resolve()),
@@ -165,7 +165,7 @@ class Partitioner(BaseProcess, ABC):

         partition_request = self.config.to_partition_kwargs()

-        #
+        # NOTE(austin): PartitionParameters is a Pydantic model in v0.26.0
         # Prior to this it was a dataclass which doesn't have .__fields
         try:
             possible_fields = PartitionParameters.__fields__
@@ -182,7 +182,7 @@ class Partitioner(BaseProcess, ABC):
                 ", ".join([v for v in partition_request if v not in filtered_partition_request])
             )
         )
-        logger.debug(f"
+        logger.debug(f"using hosted partitioner with kwargs: {partition_request}")
         with open(filename, "rb") as f:
             files = Files(
                 content=f.read(),
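The NOTE above refers to PartitionParameters becoming a Pydantic model in unstructured-client 0.26.0; the surrounding code filters the request kwargs down to fields the class actually accepts. A generic sketch of that pattern (the dataclass fallback is an assumption, since the diff only shows the try branch; DemoParams is a stand-in):

import dataclasses

def accepted_field_names(params_cls) -> set:
    try:
        # Pydantic model (unstructured-client >= 0.26.0) exposes __fields__
        return set(params_cls.__fields__)
    except AttributeError:
        # Older releases used a dataclass, which has no __fields__
        return {f.name for f in dataclasses.fields(params_cls)}

def filter_kwargs(kwargs: dict, params_cls) -> dict:
    allowed = accepted_field_names(params_cls)
    return {k: v for k, v in kwargs.items() if k in allowed}

@dataclasses.dataclass
class DemoParams:  # stand-in for PartitionParameters
    strategy: str = "auto"
    languages: list = None

print(filter_kwargs({"strategy": "hi_res", "unknown_flag": True}, DemoParams))
# {'strategy': 'hi_res'}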
unstructured_ingest/v2/utils.py
CHANGED
@@ -20,6 +20,11 @@ def is_secret(value: Any) -> bool:
 def serialize_base_model(model: BaseModel) -> dict:
     # To get the full serialized dict regardless of if values are marked as Secret
     model_dict = model.dict()
+    return serialize_base_dict(model_dict=model_dict)
+
+
+def serialize_base_dict(model_dict: dict) -> dict:
+    model_dict = model_dict.copy()
     for k, v in model_dict.items():
         if isinstance(v, _SecretBase):
             secret_value = v.get_secret_value()
@@ -27,6 +32,8 @@ def serialize_base_model(model: BaseModel) -> dict:
                 model_dict[k] = serialize_base_model(model=secret_value)
             else:
                 model_dict[k] = secret_value
+        if isinstance(v, dict):
+            model_dict[k] = serialize_base_dict(model_dict=v)

     return model_dict

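A quick sketch of what the new serialize_base_dict recursion enables (assuming unstructured-ingest 0.0.16 and a pydantic v2 environment; SecretStr is used here purely for illustration): secret values nested inside plain dict entries are now unwrapped as well, not just top-level ones.

from pydantic import SecretStr

from unstructured_ingest.v2.utils import serialize_base_dict

raw = {"api_key": SecretStr("s3cr3t"), "client": {"token": SecretStr("t0ken")}}
# The nested dict is handled by the new recursive branch
print(serialize_base_dict(model_dict=raw))
# Expected: {'api_key': 's3cr3t', 'client': {'token': 't0ken'}}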