unstructured-ingest 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +1 -1
- unstructured_ingest/cli/utils.py +1 -1
- unstructured_ingest/connector/astradb.py +1 -1
- unstructured_ingest/connector/biomed.py +4 -4
- unstructured_ingest/connector/chroma.py +1 -1
- unstructured_ingest/connector/databricks_volumes.py +2 -2
- unstructured_ingest/connector/fsspec/box.py +1 -1
- unstructured_ingest/connector/fsspec/fsspec.py +5 -5
- unstructured_ingest/connector/git.py +1 -1
- unstructured_ingest/connector/google_drive.py +4 -4
- unstructured_ingest/connector/hubspot.py +1 -1
- unstructured_ingest/connector/kafka.py +8 -8
- unstructured_ingest/connector/local.py +1 -1
- unstructured_ingest/connector/notion/helpers.py +4 -4
- unstructured_ingest/connector/onedrive.py +3 -3
- unstructured_ingest/connector/outlook.py +2 -2
- unstructured_ingest/connector/pinecone.py +1 -1
- unstructured_ingest/connector/sharepoint.py +8 -8
- unstructured_ingest/connector/vectara.py +6 -6
- unstructured_ingest/embed/__init__.py +17 -0
- unstructured_ingest/embed/bedrock.py +70 -0
- unstructured_ingest/embed/huggingface.py +73 -0
- unstructured_ingest/embed/interfaces.py +36 -0
- unstructured_ingest/embed/mixedbreadai.py +177 -0
- unstructured_ingest/embed/octoai.py +63 -0
- unstructured_ingest/embed/openai.py +61 -0
- unstructured_ingest/embed/vertexai.py +88 -0
- unstructured_ingest/embed/voyageai.py +69 -0
- unstructured_ingest/interfaces.py +21 -11
- unstructured_ingest/logger.py +1 -1
- unstructured_ingest/pipeline/copy.py +1 -1
- unstructured_ingest/pipeline/interfaces.py +2 -2
- unstructured_ingest/pipeline/partition.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/pipeline/reformat/chunking.py +2 -2
- unstructured_ingest/pipeline/reformat/embedding.py +4 -6
- unstructured_ingest/pipeline/source.py +2 -2
- unstructured_ingest/utils/compression.py +3 -3
- unstructured_ingest/utils/data_prep.py +20 -12
- unstructured_ingest/utils/string_and_date_utils.py +2 -2
- unstructured_ingest/v2/cli/base/cmd.py +3 -3
- unstructured_ingest/v2/cli/base/dest.py +1 -1
- unstructured_ingest/v2/cli/base/src.py +3 -2
- unstructured_ingest/v2/cli/utils/click.py +1 -1
- unstructured_ingest/v2/interfaces/processor.py +48 -13
- unstructured_ingest/v2/logger.py +1 -1
- unstructured_ingest/v2/otel.py +1 -1
- unstructured_ingest/v2/pipeline/interfaces.py +12 -3
- unstructured_ingest/v2/pipeline/pipeline.py +42 -29
- unstructured_ingest/v2/pipeline/steps/chunk.py +3 -3
- unstructured_ingest/v2/pipeline/steps/download.py +17 -2
- unstructured_ingest/v2/pipeline/steps/embed.py +3 -3
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
- unstructured_ingest/v2/pipeline/steps/index.py +2 -2
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -3
- unstructured_ingest/v2/pipeline/steps/stage.py +1 -1
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +6 -1
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -4
- unstructured_ingest/v2/processes/connectors/google_drive.py +2 -3
- unstructured_ingest/v2/processes/connectors/local.py +6 -5
- unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
- unstructured_ingest/v2/processes/connectors/onedrive.py +8 -6
- unstructured_ingest/v2/processes/connectors/opensearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +38 -16
- unstructured_ingest/v2/processes/connectors/sharepoint.py +10 -6
- unstructured_ingest/v2/processes/embedder.py +41 -24
- unstructured_ingest/v2/processes/filter.py +1 -1
- unstructured_ingest/v2/processes/partitioner.py +3 -3
- unstructured_ingest/v2/utils.py +7 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/METADATA +212 -211
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/RECORD +81 -72
- unstructured_ingest/evaluate.py +0 -338
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/pipeline/steps/stage.py

@@ -31,7 +31,7 @@ class UploadStageStep(PipelineStep):
             self.process.upload_stager_config.json() if self.process.upload_stager_config else None
         )
         self.cache_dir.mkdir(parents=True, exist_ok=True)
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")

     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str
unstructured_ingest/v2/pipeline/steps/uncompress.py

@@ -23,7 +23,7 @@ class UncompressStep(PipelineStep):

     def __post_init__(self):
         config = self.process.config.json() if self.process.config else None
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")

     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str
unstructured_ingest/v2/processes/connectors/__init__.py

@@ -6,6 +6,8 @@ from unstructured_ingest.v2.processes.connector_registry import (
     add_source_entry,
 )

+from .airtable import CONNECTOR_TYPE as AIRTABLE_CONNECTOR_TYPE
+from .airtable import airtable_source_entry
 from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
 from .astradb import astra_db_destination_entry
 from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE

@@ -92,3 +94,4 @@ add_destination_entry(
 )

 add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)
+add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry)
unstructured_ingest/v2/processes/connectors/airtable.py (new file)

@@ -0,0 +1,235 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
+from uuid import NAMESPACE_DNS, uuid5
+
+import pandas
+from pydantic import BaseModel, Field, Secret, field_validator
+
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    FileData,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+    download_responses,
+)
+from unstructured_ingest.v2.processes.connector_registry import (
+    SourceRegistryEntry,
+)
+
+if TYPE_CHECKING:
+    from pyairtable import Api
+    from pyairtable.api.types import RecordDict
+
+CONNECTOR_TYPE = "airtable"
+
+
+class AirtableTableMeta(BaseModel):
+    """Metadata specifying a table id, a base id which the table is stored in,
+    and an t.Optional view id in case particular rows and fields are to be ingested"""
+
+    base_id: str
+    table_id: str
+    view_id: Optional[str] = None
+
+    def get_id(self) -> str:
+        id_s = f"{self.base_id}{self.table_id}"
+        id_s = f"{id_s}{self.view_id}" if self.view_id else id_s
+        return str(uuid5(NAMESPACE_DNS, id_s))
+
+
+class AirtableAccessConfig(AccessConfig):
+    personal_access_token: str = Field(
+        description="Personal access token to authenticate into Airtable. Check: "
+        "https://support.airtable.com/docs/creating-and-using-api-keys-and-access-tokens "
+        "for more info"
+    )
+
+
+class AirtableConnectionConfig(ConnectionConfig):
+    access_config: Secret[AirtableAccessConfig]
+
+    @requires_dependencies(["pyairtable"], extras="airtable")
+    def get_client(self) -> "Api":
+        from pyairtable import Api
+
+        access_config = self.access_config.get_secret_value()
+        return Api(api_key=access_config.personal_access_token)
+
+
+class AirtableIndexerConfig(IndexerConfig):
+    list_of_paths: Optional[list[str]] = Field(
+        default=None,
+        description="""
+    A list of paths that specify the locations to ingest data from within Airtable.
+
+    If this argument is not set, the connector ingests all tables within each and every base.
+    --list-of-paths: path1 path2 path3 ….
+    path: base_id/table_id(optional)/view_id(optional)/
+
+    To obtain (base, table, view) ids in bulk, check:
+    https://airtable.com/developers/web/api/list-bases (base ids)
+    https://airtable.com/developers/web/api/get-base-schema (table and view ids)
+    https://pyairtable.readthedocs.io/en/latest/metadata.html (base, table and view ids)
+
+    To obtain specific ids from Airtable UI, go to your workspace, and copy any
+    relevant id from the URL structure:
+    https://airtable.com/appAbcDeF1ghijKlm/tblABcdEfG1HIJkLm/viwABCDEfg6hijKLM
+    appAbcDeF1ghijKlm -> base_id
+    tblABcdEfG1HIJkLm -> table_id
+    viwABCDEfg6hijKLM -> view_id
+
+    You can also check: https://support.airtable.com/docs/finding-airtable-ids
+
+    Here is an example for one --list-of-paths:
+        base1/ → gets the entirety of all tables inside base1
+        base1/table1 → gets all rows and columns within table1 in base1
+        base1/table1/view1 → gets the rows and columns that are
+            visible in view1 for the table1 in base1
+
+    Examples to invalid airtable_paths:
+        table1 → has to mention base to be valid
+        base1/view1 → has to mention table to be valid
+    """,
+    )
+
+    @classmethod
+    def validate_path(cls, path: str):
+        components = path.split("/")
+        if len(components) > 3:
+            raise ValueError(
+                f"Path must be of the format: base_id/table_id/view_id, "
+                f"where table id and view id are optional. Got: {path}"
+            )
+
+    @field_validator("list_of_paths")
+    @classmethod
+    def validate_format(cls, v: list[str]) -> list[str]:
+        for path in v:
+            cls.validate_path(path=path)
+        return v
+
+
+@dataclass
+class AirtableIndexer(Indexer):
+    connector_type: str = CONNECTOR_TYPE
+    connection_config: AirtableConnectionConfig
+    index_config: AirtableIndexerConfig
+
+    def get_all_table_meta(self) -> list[AirtableTableMeta]:
+        client = self.connection_config.get_client()
+        bases = client.bases()
+        airtable_meta = []
+        for base in bases:
+            for table in base.schema().tables:
+                airtable_meta.append(AirtableTableMeta(base_id=base.id, table_id=table.id))
+        return airtable_meta
+
+    def get_base_tables_meta(self, base_id: str) -> list[AirtableTableMeta]:
+        client = self.connection_config.get_client()
+        base = client.base(base_id=base_id)
+        airtable_meta = []
+        for table in base.tables():
+            airtable_meta.append(AirtableTableMeta(base_id=base.id, table_id=table.id))
+        return airtable_meta
+
+    def get_meta_from_list(self) -> list[AirtableTableMeta]:
+        airtable_meta = []
+        for path in self.index_config.list_of_paths:
+            components = path.split("/")
+            if len(components) == 1:
+                airtable_meta.extend(self.get_base_tables_meta(base_id=components[0]))
+            elif len(components) == 2:
+                airtable_meta.append(
+                    AirtableTableMeta(base_id=components[0], table_id=components[1])
+                )
+            elif len(components) == 3:
+                airtable_meta.append(
+                    AirtableTableMeta(
+                        base_id=components[0], table_id=components[1], view_id=components[2]
+                    )
+                )
+            else:
+                raise ValueError(
+                    f"Path must be of the format: base_id/table_id/view_id, "
+                    f"where table id and view id are optional. Got: {path}"
+                )
+        return airtable_meta
+
+    def get_table_metas(self) -> list[AirtableTableMeta]:
+        if not self.index_config.list_of_paths:
+            return self.get_all_table_meta()
+        return self.get_meta_from_list()
+
+    def precheck(self) -> None:
+        client = self.connection_config.get_client()
+        client.request(method="HEAD", url=client.build_url("meta", "bases"))
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        table_metas = self.get_table_metas()
+        for table_meta in table_metas:
+            fullpath = (
+                f"{table_meta.base_id}/{table_meta.table_id}/{table_meta.view_id}.csv"
+                if table_meta.view_id
+                else f"{table_meta.base_id}/{table_meta.table_id}.csv"
+            )
+            yield FileData(
+                identifier=table_meta.get_id(),
+                connector_type=CONNECTOR_TYPE,
+                additional_metadata=table_meta.dict(),
+                source_identifiers=SourceIdentifiers(
+                    filename=str(Path(fullpath).name),
+                    fullpath=fullpath,
+                ),
+            )
+
+
+class AirtableDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class AirtableDownloader(Downloader):
+    connection_config: AirtableConnectionConfig
+    download_config: AirtableDownloaderConfig = field(default_factory=AirtableDownloaderConfig)
+    connector_type: str = CONNECTOR_TYPE
+
+    def get_table_contents(self, table_meta: AirtableTableMeta) -> list["RecordDict"]:
+        client = self.connection_config.get_client()
+        table = client.table(base_id=table_meta.base_id, table_name=table_meta.table_id)
+        table_fetch_kwargs = {"view": table_meta.view_id} if table_meta.view_id else {}
+        rows = table.all(**table_fetch_kwargs)
+        return rows
+
+    def _table_row_to_dict(self, table_row: "RecordDict") -> dict:
+        row_dict = {
+            "id": table_row["id"],
+            "created_time": table_row["createdTime"],
+        }
+        row_dict.update(table_row["fields"])
+        return row_dict
+
+    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        table_meta = AirtableTableMeta.model_validate(file_data.additional_metadata)
+        table_contents = self.get_table_contents(table_meta=table_meta)
+        df = pandas.DataFrame.from_dict(
+            data=[self._table_row_to_dict(table_row=row) for row in table_contents]
+        ).sort_index(axis=1)
+        download_path = self.get_download_path(file_data=file_data)
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        df.to_csv(path_or_buf=download_path)
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+airtable_source_entry = SourceRegistryEntry(
+    indexer=AirtableIndexer,
+    indexer_config=AirtableIndexerConfig,
+    downloader=AirtableDownloader,
+    downloader_config=AirtableDownloaderConfig,
+    connection_config=AirtableConnectionConfig,
+)
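For orientation, the --list-of-paths format documented above can be exercised directly through the new config model's validator. A minimal sketch (the id values are made up; only paths with at most three slash-separated components pass validation):

from unstructured_ingest.v2.processes.connectors.airtable import AirtableIndexerConfig

# Accepted: base only, base/table, or base/table/view.
config = AirtableIndexerConfig(
    list_of_paths=[
        "appAbcDeF1ghijKlm",
        "appAbcDeF1ghijKlm/tblABcdEfG1HIJkLm",
        "appAbcDeF1ghijKlm/tblABcdEfG1HIJkLm/viwABCDEfg6hijKLM",
    ]
)

# Rejected: more than three components should raise a pydantic validation error
# wrapping the ValueError from validate_path.
AirtableIndexerConfig(list_of_paths=["base1/table1/view1/extra"])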
unstructured_ingest/v2/processes/connectors/chroma.py

@@ -41,9 +41,14 @@ class ChromaAccessConfig(AccessConfig):
     )


+SecretChromaAccessConfig = Secret[ChromaAccessConfig]
+
+
 class ChromaConnectionConfig(ConnectionConfig):
     collection_name: str = Field(description="The name of the Chroma collection to write into.")
-    access_config:
+    access_config: SecretChromaAccessConfig = Field(
+        default=SecretChromaAccessConfig(secret_value=ChromaAccessConfig())
+    )
     path: Optional[str] = Field(
         default=None, description="Location where Chroma is persisted, if not connecting via http."
     )
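The new SecretChromaAccessConfig alias wraps the access config in Pydantic's generic Secret container so the credentials are masked when the model is printed or logged. A rough illustration of the pattern with a stand-in model (DemoAccessConfig and its api_key field are invented for the example):

from typing import Optional

from pydantic import BaseModel, Secret

class DemoAccessConfig(BaseModel):
    api_key: Optional[str] = None

SecretDemoAccessConfig = Secret[DemoAccessConfig]

wrapped = SecretDemoAccessConfig(secret_value=DemoAccessConfig(api_key="sk-example"))
print(wrapped)                     # printed form should be masked, so the key is not leaked
print(wrapped.get_secret_value())  # returns the unwrapped DemoAccessConfig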
unstructured_ingest/v2/processes/connectors/elasticsearch.py

@@ -104,7 +104,7 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
         elif access_config.es_api_key:
             client_input_kwargs["api_key"] = access_config.es_api_key
         client_input = ElasticsearchClientInput(**client_input_kwargs)
-        logger.debug(f"
+        logger.debug(f"elasticsearch client inputs mapped to: {client_input.dict()}")
         client_kwargs = client_input.dict()
         client_kwargs["basic_auth"] = (
             client_input.basic_auth.get_secret_value() if client_input.basic_auth else None
unstructured_ingest/v2/processes/connectors/fsspec/box.py

@@ -47,7 +47,7 @@ class BoxConnectionConfig(FsspecConnectionConfig):
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

     def get_access_config(self) -> dict[str, Any]:
-        # Return access_kwargs with oauth. The oauth object
+        # Return access_kwargs with oauth. The oauth object cannot be stored directly in the config
         # because it is not serializable.
        from boxsdk import JWTAuth

unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py

@@ -317,9 +317,9 @@ class FsspecUploader(Uploader):
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
         if self.fs.exists(path=str(upload_path)) and not self.upload_config.overwrite:
-            logger.debug(f"
+            logger.debug(f"skipping upload of {path} to {upload_path}, file already exists")
             return
-        logger.debug(f"
+        logger.debug(f"writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))

     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:

@@ -328,7 +328,7 @@ class FsspecUploader(Uploader):
         # Odd that fsspec doesn't run exists() as async even when client support async
         already_exists = self.fs.exists(path=str(upload_path))
         if already_exists and not self.upload_config.overwrite:
-            logger.debug(f"
+            logger.debug(f"skipping upload of {path} to {upload_path}, file already exists")
             return
-        logger.debug(f"
+        logger.debug(f"writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))
unstructured_ingest/v2/processes/connectors/google_drive.py

@@ -28,8 +28,7 @@ from unstructured_ingest.v2.interfaces import (
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
-
-from .utils import conform_string_to_dict
+from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict

 CONNECTOR_TYPE = "google_drive"


@@ -200,7 +199,7 @@ class GoogleDriveIndexer(Indexer):
         if extensions:
             ext_filter = " or ".join([f"fileExtension = '{e}'" for e in extensions])
             q = f"{q} and ({ext_filter} or mimeType = 'application/vnd.google-apps.folder')"
-        logger.debug(f"
+        logger.debug(f"query used when indexing: {q}")
         logger.debug("response fields limited to: {}".format(", ".join(self.fields)))
         done = False
         page_token = None
unstructured_ingest/v2/processes/connectors/local.py

@@ -180,14 +180,15 @@ class LocalUploader(Uploader):

     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         if source_identifiers := file_data.source_identifiers:
-            identifiers = source_identifiers
             rel_path = (
-
-                if
-                else
+                source_identifiers.relative_path[1:]
+                if source_identifiers.relative_path.startswith("/")
+                else source_identifiers.relative_path
             )
             new_path = self.upload_config.output_path / Path(rel_path)
-            final_path = str(new_path).replace(
+            final_path = str(new_path).replace(
+                source_identifiers.filename, f"{source_identifiers.filename}.json"
+            )
         else:
             final_path = self.upload_config.output_path / Path(f"{file_data.identifier}.json")
         Path(final_path).parent.mkdir(parents=True, exist_ok=True)
unstructured_ingest/v2/processes/connectors/milvus.py

@@ -71,7 +71,7 @@ class MilvusUploadStagerConfig(UploadStagerConfig):
     fields_to_include: Optional[list[str]] = None
     """If set - list of fields to include in the output.
     Unspecified fields are removed from the elements.
-    This action
+    This action takes place after metadata flattening.
     Missing fields will cause stager to throw KeyError."""

     flatten_metadata: bool = True
unstructured_ingest/v2/processes/connectors/onedrive.py

@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import json
 from dataclasses import dataclass
 from pathlib import Path

@@ -103,7 +105,7 @@ class OnedriveIndexer(Indexer):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")

-    def list_objects(self, folder, recursive) -> list["DriveItem"]:
+    def list_objects(self, folder: DriveItem, recursive: bool) -> list["DriveItem"]:
         drive_items = folder.children.get().execute_query()
         files = [d for d in drive_items if d.is_file]
         if not recursive:

@@ -139,12 +141,12 @@ class OnedriveIndexer(Indexer):
         server_path = file_path + "/" + filename
         rel_path = server_path.replace(self.index_config.path, "").lstrip("/")
         date_modified_dt = (
-            parser.parse(drive_item.last_modified_datetime)
+            parser.parse(str(drive_item.last_modified_datetime))
             if drive_item.last_modified_datetime
             else None
         )
         date_created_at = (
-            parser.parse(drive_item.created_datetime) if drive_item.created_datetime else None
+            parser.parse(str(drive_item.created_datetime)) if drive_item.created_datetime else None
         )
         return FileData(
             identifier=drive_item.id,

@@ -156,7 +158,7 @@ class OnedriveIndexer(Indexer):
             url=drive_item.parent_reference.path + "/" + drive_item.name,
             version=drive_item.etag,
             date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
-            date_created=str(date_created_at.timestamp()) if
+            date_created=str(date_created_at.timestamp()) if date_created_at else None,
             date_processed=str(time()),
             record_locator={
                 "user_pname": self.connection_config.user_pname,

@@ -211,9 +213,9 @@ class OnedriveDownloader(Downloader):
         fsize = file.get_property("size", 0)
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
-        logger.info(f"
+        logger.info(f"downloading {file_data.source_identifiers.fullpath} to {download_path}")
         if fsize > MAX_MB_SIZE:
-            logger.info(f"
+            logger.info(f"downloading file with size: {fsize} bytes in chunks")
             with download_path.open(mode="wb") as f:
                 file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query()
         else:
unstructured_ingest/v2/processes/connectors/opensearch.py

@@ -101,7 +101,7 @@ class OpenSearchConnectionConfig(ConnectionConfig):
         if self.username and access_config.password:
             client_input_kwargs["http_auth"] = (self.username, access_config.password)
         client_input = OpenSearchClientInput(**client_input_kwargs)
-        logger.debug(f"
+        logger.debug(f"opensearch client inputs mapped to: {client_input.dict()}")
         client_kwargs = client_input.dict()
         if client_input.http_auth is not None:
             client_kwargs["http_auth"] = client_input.http_auth.get_secret_value()
unstructured_ingest/v2/processes/connectors/pinecone.py

@@ -27,6 +27,7 @@ if TYPE_CHECKING:

 CONNECTOR_TYPE = "pinecone"
 MAX_PAYLOAD_SIZE = 2 * 1024 * 1024  # 2MB
+MAX_POOL_THREADS = 100


 class PineconeAccessConfig(AccessConfig):

@@ -45,7 +46,7 @@ class PineconeConnectionConfig(ConnectionConfig):
     )

     @requires_dependencies(["pinecone"], extras="pinecone")
-    def get_index(self) -> "PineconeIndex":
+    def get_index(self, **index_kwargs) -> "PineconeIndex":
         from pinecone import Pinecone

         from unstructured_ingest import __version__ as unstructured_version

@@ -55,8 +56,8 @@ class PineconeConnectionConfig(ConnectionConfig):
             source_tag=f"unstructured_ingest=={unstructured_version}",
         )

-        index = pc.Index(self.index_name)
-        logger.debug(f"
+        index = pc.Index(name=self.index_name, **index_kwargs)
+        logger.debug(f"connected to index: {pc.describe_index(self.index_name)}")
         return index


@@ -65,7 +66,13 @@ class PineconeUploadStagerConfig(UploadStagerConfig):


 class PineconeUploaderConfig(UploaderConfig):
-    batch_size: int = Field(
+    batch_size: Optional[int] = Field(
+        default=None,
+        description="Optional number of records per batch. Will otherwise limit by size.",
+    )
+    pool_threads: Optional[int] = Field(
+        default=1, description="Optional limit on number of threads to use for upload"
+    )


 ALLOWED_FIELDS = (

@@ -149,29 +156,44 @@ class PineconeUploader(Uploader):
         raise DestinationConnectionError(f"failed to validate connection: {e}")

     @requires_dependencies(["pinecone"], extras="pinecone")
-    def
+    def upsert_batches_async(self, elements_dict: list[dict]):
         from pinecone.exceptions import PineconeApiException

-
-
-
-
-
-
+        chunks = list(
+            generator_batching_wbytes(
+                iterable=elements_dict,
+                batch_size_limit_bytes=MAX_PAYLOAD_SIZE - 100,
+                max_batch_size=self.upload_config.batch_size,
+            )
+        )
+        logger.info(f"split doc with {len(elements_dict)} elements into {len(chunks)} batches")
+
+        max_pool_threads = min(len(chunks), MAX_POOL_THREADS)
+        if self.upload_config.pool_threads:
+            pool_threads = min(self.upload_config.pool_threads, max_pool_threads)
+        else:
+            pool_threads = max_pool_threads
+        index = self.connection_config.get_index(pool_threads=pool_threads)
+        with index:
+            async_results = [index.upsert(vectors=chunk, async_req=True) for chunk in chunks]
+            # Wait for and retrieve responses (this raises in case of error)
+            try:
+                results = [async_result.get() for async_result in async_results]
+            except PineconeApiException as api_error:
+                raise DestinationConnectionError(f"http error: {api_error}") from api_error
+            logger.debug(f"results: {results}")

     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         with path.open("r") as file:
             elements_dict = json.load(file)
         logger.info(
-            f"writing
+            f"writing a total of {len(elements_dict)} elements via"
+            f" document batches to destination"
             f" index named {self.connection_config.index_name}"
             f" with batch size {self.upload_config.batch_size}"
         )

-
-            elements_dict, MAX_PAYLOAD_SIZE - 100, self.upload_config.batch_size
-        ):
-            self.upsert_batch(batch=batch)
+        self.upsert_batches_async(elements_dict=elements_dict)


 pinecone_destination_entry = DestinationRegistryEntry(
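The new upsert_batches_async path batches records by serialized payload size (kept just under MAX_PAYLOAD_SIZE) and only optionally by record count. The helper generator_batching_wbytes lives in unstructured_ingest/utils/data_prep.py and is not shown in this diff; the following batch_by_bytes function is a hypothetical stand-in that sketches the same idea, for illustration only:

import json
from typing import Any, Generator, Iterable, Optional

def batch_by_bytes(
    iterable: Iterable[dict[str, Any]],
    batch_size_limit_bytes: int,
    max_batch_size: Optional[int] = None,
) -> Generator[list[dict[str, Any]], None, None]:
    # Accumulate items until adding the next one would exceed the byte budget
    # (measured on the JSON-serialized item) or the optional record cap.
    batch: list[dict[str, Any]] = []
    batch_bytes = 0
    for item in iterable:
        item_bytes = len(json.dumps(item).encode("utf-8"))
        over_bytes = batch and batch_bytes + item_bytes > batch_size_limit_bytes
        over_count = max_batch_size is not None and len(batch) >= max_batch_size
        if over_bytes or over_count:
            yield batch
            batch, batch_bytes = [], 0
        batch.append(item)
        batch_bytes += item_bytes
    if batch:
        yield batch

In the actual uploader above, each batch is then handed to index.upsert(vectors=chunk, async_req=True) across a thread pool capped by pool_threads, and the trailing .get() calls surface any Pinecone API error before the step completes.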
unstructured_ingest/v2/processes/connectors/sharepoint.py

@@ -60,13 +60,16 @@ class SharepointAccessConfig(AccessConfig):


 class SharepointPermissionsConfig(BaseModel):
-    permissions_application_id: str = Field(
-
+    permissions_application_id: Optional[str] = Field(
+        default=None, description="Microsoft Graph API application id"
+    )
+    permissions_tenant: Optional[str] = Field(
+        default=None,
         description="url to get permissions data within tenant.",
         examples=["https://contoso.onmicrosoft.com"],
     )
-    permissions_client_cred: SecretStr = Field(
-        description="Microsoft Graph API application credentials"
+    permissions_client_cred: Optional[SecretStr] = Field(
+        default=None, description="Microsoft Graph API application credentials"
     )
     authority_url: Optional[SecretStr] = Field(
         repr=False,

@@ -139,7 +142,7 @@ class SharepointConnectionConfig(ConnectionConfig):

 class SharepointIndexerConfig(IndexerConfig):
     path: Optional[str] = Field(
-
+        default=None,
         description="Path from which to start parsing files. If the connector is to \
             process all sites within the tenant this filter will be applied to \
             all sites document libraries.",

@@ -335,7 +338,8 @@ class SharepointIndexer(Indexer):
     @property
     def process_permissions(self) -> bool:
         return (
-            self.connection_config.permissions_config
+            self.connection_config.permissions_config is not None
+            and self.connection_config.permissions_config.permissions_tenant
             and self.connection_config.permissions_config.permissions_client_cred.get_secret_value()
             and self.connection_config.permissions_config.permissions_application_id
         )