unstructured-ingest 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their public registry.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (82)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/interfaces.py +1 -1
  3. unstructured_ingest/cli/utils.py +1 -1
  4. unstructured_ingest/connector/astradb.py +1 -1
  5. unstructured_ingest/connector/biomed.py +4 -4
  6. unstructured_ingest/connector/chroma.py +1 -1
  7. unstructured_ingest/connector/databricks_volumes.py +2 -2
  8. unstructured_ingest/connector/fsspec/box.py +1 -1
  9. unstructured_ingest/connector/fsspec/fsspec.py +5 -5
  10. unstructured_ingest/connector/git.py +1 -1
  11. unstructured_ingest/connector/google_drive.py +4 -4
  12. unstructured_ingest/connector/hubspot.py +1 -1
  13. unstructured_ingest/connector/kafka.py +8 -8
  14. unstructured_ingest/connector/local.py +1 -1
  15. unstructured_ingest/connector/notion/helpers.py +4 -4
  16. unstructured_ingest/connector/onedrive.py +3 -3
  17. unstructured_ingest/connector/outlook.py +2 -2
  18. unstructured_ingest/connector/pinecone.py +1 -1
  19. unstructured_ingest/connector/sharepoint.py +8 -8
  20. unstructured_ingest/connector/vectara.py +6 -6
  21. unstructured_ingest/embed/__init__.py +17 -0
  22. unstructured_ingest/embed/bedrock.py +70 -0
  23. unstructured_ingest/embed/huggingface.py +73 -0
  24. unstructured_ingest/embed/interfaces.py +36 -0
  25. unstructured_ingest/embed/mixedbreadai.py +177 -0
  26. unstructured_ingest/embed/octoai.py +63 -0
  27. unstructured_ingest/embed/openai.py +61 -0
  28. unstructured_ingest/embed/vertexai.py +88 -0
  29. unstructured_ingest/embed/voyageai.py +69 -0
  30. unstructured_ingest/interfaces.py +21 -11
  31. unstructured_ingest/logger.py +1 -1
  32. unstructured_ingest/pipeline/copy.py +1 -1
  33. unstructured_ingest/pipeline/interfaces.py +2 -2
  34. unstructured_ingest/pipeline/partition.py +1 -1
  35. unstructured_ingest/pipeline/pipeline.py +1 -1
  36. unstructured_ingest/pipeline/reformat/chunking.py +2 -2
  37. unstructured_ingest/pipeline/reformat/embedding.py +4 -6
  38. unstructured_ingest/pipeline/source.py +2 -2
  39. unstructured_ingest/utils/compression.py +3 -3
  40. unstructured_ingest/utils/data_prep.py +20 -12
  41. unstructured_ingest/utils/string_and_date_utils.py +2 -2
  42. unstructured_ingest/v2/cli/base/cmd.py +3 -3
  43. unstructured_ingest/v2/cli/base/dest.py +1 -1
  44. unstructured_ingest/v2/cli/base/src.py +3 -2
  45. unstructured_ingest/v2/cli/utils/click.py +1 -1
  46. unstructured_ingest/v2/interfaces/processor.py +48 -13
  47. unstructured_ingest/v2/logger.py +1 -1
  48. unstructured_ingest/v2/otel.py +1 -1
  49. unstructured_ingest/v2/pipeline/interfaces.py +12 -3
  50. unstructured_ingest/v2/pipeline/pipeline.py +42 -29
  51. unstructured_ingest/v2/pipeline/steps/chunk.py +3 -3
  52. unstructured_ingest/v2/pipeline/steps/download.py +17 -2
  53. unstructured_ingest/v2/pipeline/steps/embed.py +3 -3
  54. unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
  55. unstructured_ingest/v2/pipeline/steps/index.py +2 -2
  56. unstructured_ingest/v2/pipeline/steps/partition.py +3 -3
  57. unstructured_ingest/v2/pipeline/steps/stage.py +1 -1
  58. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
  59. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  60. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  61. unstructured_ingest/v2/processes/connectors/chroma.py +6 -1
  62. unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -1
  63. unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
  64. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -4
  65. unstructured_ingest/v2/processes/connectors/google_drive.py +2 -3
  66. unstructured_ingest/v2/processes/connectors/local.py +6 -5
  67. unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
  68. unstructured_ingest/v2/processes/connectors/onedrive.py +8 -6
  69. unstructured_ingest/v2/processes/connectors/opensearch.py +1 -1
  70. unstructured_ingest/v2/processes/connectors/pinecone.py +38 -16
  71. unstructured_ingest/v2/processes/connectors/sharepoint.py +10 -6
  72. unstructured_ingest/v2/processes/embedder.py +41 -24
  73. unstructured_ingest/v2/processes/filter.py +1 -1
  74. unstructured_ingest/v2/processes/partitioner.py +3 -3
  75. unstructured_ingest/v2/utils.py +7 -0
  76. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/METADATA +212 -211
  77. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/RECORD +81 -72
  78. unstructured_ingest/evaluate.py +0 -338
  79. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/LICENSE.md +0 -0
  80. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/WHEEL +0 -0
  81. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/entry_points.txt +0 -0
  82. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/pipeline/steps/stage.py
@@ -31,7 +31,7 @@ class UploadStageStep(PipelineStep):
             self.process.upload_stager_config.json() if self.process.upload_stager_config else None
         )
         self.cache_dir.mkdir(parents=True, exist_ok=True)
-        logger.info(f"Created {self.identifier} with configs: {config}")
+        logger.info(f"created {self.identifier} with configs: {config}")

     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str
unstructured_ingest/v2/pipeline/steps/uncompress.py
@@ -23,7 +23,7 @@ class UncompressStep(PipelineStep):

     def __post_init__(self):
         config = self.process.config.json() if self.process.config else None
-        logger.info(f"Created {self.identifier} with configs: {config}")
+        logger.info(f"created {self.identifier} with configs: {config}")

     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str
unstructured_ingest/v2/processes/connectors/__init__.py
@@ -6,6 +6,8 @@ from unstructured_ingest.v2.processes.connector_registry import (
     add_source_entry,
 )

+from .airtable import CONNECTOR_TYPE as AIRTABLE_CONNECTOR_TYPE
+from .airtable import airtable_source_entry
 from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
 from .astradb import astra_db_destination_entry
 from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE
@@ -92,3 +94,4 @@ add_destination_entry(
 )

 add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)
+add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry)
unstructured_ingest/v2/processes/connectors/airtable.py
@@ -0,0 +1,235 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Generator, Optional
+from uuid import NAMESPACE_DNS, uuid5
+
+import pandas
+from pydantic import BaseModel, Field, Secret, field_validator
+
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    FileData,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+    download_responses,
+)
+from unstructured_ingest.v2.processes.connector_registry import (
+    SourceRegistryEntry,
+)
+
+if TYPE_CHECKING:
+    from pyairtable import Api
+    from pyairtable.api.types import RecordDict
+
+CONNECTOR_TYPE = "airtable"
+
+
+class AirtableTableMeta(BaseModel):
+    """Metadata specifying a table id, a base id which the table is stored in,
+    and an t.Optional view id in case particular rows and fields are to be ingested"""
+
+    base_id: str
+    table_id: str
+    view_id: Optional[str] = None
+
+    def get_id(self) -> str:
+        id_s = f"{self.base_id}{self.table_id}"
+        id_s = f"{id_s}{self.view_id}" if self.view_id else id_s
+        return str(uuid5(NAMESPACE_DNS, id_s))
+
+
+class AirtableAccessConfig(AccessConfig):
+    personal_access_token: str = Field(
+        description="Personal access token to authenticate into Airtable. Check: "
+        "https://support.airtable.com/docs/creating-and-using-api-keys-and-access-tokens "
+        "for more info"
+    )
+
+
+class AirtableConnectionConfig(ConnectionConfig):
+    access_config: Secret[AirtableAccessConfig]
+
+    @requires_dependencies(["pyairtable"], extras="airtable")
+    def get_client(self) -> "Api":
+        from pyairtable import Api
+
+        access_config = self.access_config.get_secret_value()
+        return Api(api_key=access_config.personal_access_token)
+
+
+class AirtableIndexerConfig(IndexerConfig):
+    list_of_paths: Optional[list[str]] = Field(
+        default=None,
+        description="""
+        A list of paths that specify the locations to ingest data from within Airtable.
+
+        If this argument is not set, the connector ingests all tables within each and every base.
+        --list-of-paths: path1 path2 path3 ….
+        path: base_id/table_id(optional)/view_id(optional)/
+
+        To obtain (base, table, view) ids in bulk, check:
+        https://airtable.com/developers/web/api/list-bases (base ids)
+        https://airtable.com/developers/web/api/get-base-schema (table and view ids)
+        https://pyairtable.readthedocs.io/en/latest/metadata.html (base, table and view ids)
+
+        To obtain specific ids from Airtable UI, go to your workspace, and copy any
+        relevant id from the URL structure:
+        https://airtable.com/appAbcDeF1ghijKlm/tblABcdEfG1HIJkLm/viwABCDEfg6hijKLM
+        appAbcDeF1ghijKlm -> base_id
+        tblABcdEfG1HIJkLm -> table_id
+        viwABCDEfg6hijKLM -> view_id
+
+        You can also check: https://support.airtable.com/docs/finding-airtable-ids
+
+        Here is an example for one --list-of-paths:
+        base1/ → gets the entirety of all tables inside base1
+        base1/table1 → gets all rows and columns within table1 in base1
+        base1/table1/view1 → gets the rows and columns that are
+        visible in view1 for the table1 in base1
+
+        Examples to invalid airtable_paths:
+        table1 → has to mention base to be valid
+        base1/view1 → has to mention table to be valid
+        """,
+    )
+
+    @classmethod
+    def validate_path(cls, path: str):
+        components = path.split("/")
+        if len(components) > 3:
+            raise ValueError(
+                f"Path must be of the format: base_id/table_id/view_id, "
+                f"where table id and view id are optional. Got: {path}"
+            )
+
+    @field_validator("list_of_paths")
+    @classmethod
+    def validate_format(cls, v: list[str]) -> list[str]:
+        for path in v:
+            cls.validate_path(path=path)
+        return v
+
+
+@dataclass
+class AirtableIndexer(Indexer):
+    connector_type: str = CONNECTOR_TYPE
+    connection_config: AirtableConnectionConfig
+    index_config: AirtableIndexerConfig
+
+    def get_all_table_meta(self) -> list[AirtableTableMeta]:
+        client = self.connection_config.get_client()
+        bases = client.bases()
+        airtable_meta = []
+        for base in bases:
+            for table in base.schema().tables:
+                airtable_meta.append(AirtableTableMeta(base_id=base.id, table_id=table.id))
+        return airtable_meta
+
+    def get_base_tables_meta(self, base_id: str) -> list[AirtableTableMeta]:
+        client = self.connection_config.get_client()
+        base = client.base(base_id=base_id)
+        airtable_meta = []
+        for table in base.tables():
+            airtable_meta.append(AirtableTableMeta(base_id=base.id, table_id=table.id))
+        return airtable_meta
+
+    def get_meta_from_list(self) -> list[AirtableTableMeta]:
+        airtable_meta = []
+        for path in self.index_config.list_of_paths:
+            components = path.split("/")
+            if len(components) == 1:
+                airtable_meta.extend(self.get_base_tables_meta(base_id=components[0]))
+            elif len(components) == 2:
+                airtable_meta.append(
+                    AirtableTableMeta(base_id=components[0], table_id=components[1])
+                )
+            elif len(components) == 3:
+                airtable_meta.append(
+                    AirtableTableMeta(
+                        base_id=components[0], table_id=components[1], view_id=components[2]
+                    )
+                )
+            else:
+                raise ValueError(
+                    f"Path must be of the format: base_id/table_id/view_id, "
+                    f"where table id and view id are optional. Got: {path}"
+                )
+        return airtable_meta
+
+    def get_table_metas(self) -> list[AirtableTableMeta]:
+        if not self.index_config.list_of_paths:
+            return self.get_all_table_meta()
+        return self.get_meta_from_list()
+
+    def precheck(self) -> None:
+        client = self.connection_config.get_client()
+        client.request(method="HEAD", url=client.build_url("meta", "bases"))
+
+    def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
+        table_metas = self.get_table_metas()
+        for table_meta in table_metas:
+            fullpath = (
+                f"{table_meta.base_id}/{table_meta.table_id}/{table_meta.view_id}.csv"
+                if table_meta.view_id
+                else f"{table_meta.base_id}/{table_meta.table_id}.csv"
+            )
+            yield FileData(
+                identifier=table_meta.get_id(),
+                connector_type=CONNECTOR_TYPE,
+                additional_metadata=table_meta.dict(),
+                source_identifiers=SourceIdentifiers(
+                    filename=str(Path(fullpath).name),
+                    fullpath=fullpath,
+                ),
+            )
+
+
+class AirtableDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class AirtableDownloader(Downloader):
+    connection_config: AirtableConnectionConfig
+    download_config: AirtableDownloaderConfig = field(default_factory=AirtableDownloaderConfig)
+    connector_type: str = CONNECTOR_TYPE
+
+    def get_table_contents(self, table_meta: AirtableTableMeta) -> list["RecordDict"]:
+        client = self.connection_config.get_client()
+        table = client.table(base_id=table_meta.base_id, table_name=table_meta.table_id)
+        table_fetch_kwargs = {"view": table_meta.view_id} if table_meta.view_id else {}
+        rows = table.all(**table_fetch_kwargs)
+        return rows
+
+    def _table_row_to_dict(self, table_row: "RecordDict") -> dict:
+        row_dict = {
+            "id": table_row["id"],
+            "created_time": table_row["createdTime"],
+        }
+        row_dict.update(table_row["fields"])
+        return row_dict
+
+    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+        table_meta = AirtableTableMeta.model_validate(file_data.additional_metadata)
+        table_contents = self.get_table_contents(table_meta=table_meta)
+        df = pandas.DataFrame.from_dict(
+            data=[self._table_row_to_dict(table_row=row) for row in table_contents]
+        ).sort_index(axis=1)
+        download_path = self.get_download_path(file_data=file_data)
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        df.to_csv(path_or_buf=download_path)
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+airtable_source_entry = SourceRegistryEntry(
+    indexer=AirtableIndexer,
+    indexer_config=AirtableIndexerConfig,
+    downloader=AirtableDownloader,
+    downloader_config=AirtableDownloaderConfig,
+    connection_config=AirtableConnectionConfig,
+)
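
Taken together, the new module wires up a complete Airtable source: the indexer enumerates tables (optionally narrowed by base/table/view paths) and the downloader dumps each table to CSV. Below is a minimal usage sketch based only on the classes shown above; the token and base id are placeholders, and it assumes the pyairtable extra is installed and a writable default download location.

    from unstructured_ingest.v2.processes.connectors.airtable import (
        AirtableAccessConfig,
        AirtableConnectionConfig,
        AirtableDownloader,
        AirtableDownloaderConfig,
        AirtableIndexer,
        AirtableIndexerConfig,
    )

    # Placeholder credentials and base id; a real personal access token is required.
    connection_config = AirtableConnectionConfig(
        access_config=AirtableAccessConfig(personal_access_token="pat_xxx")
    )
    indexer = AirtableIndexer(
        connection_config=connection_config,
        index_config=AirtableIndexerConfig(list_of_paths=["appAbcDeF1ghijKlm"]),
    )
    downloader = AirtableDownloader(
        connection_config=connection_config,
        download_config=AirtableDownloaderConfig(),
    )

    # Each FileData yielded by the indexer identifies one table; the downloader
    # fetches its rows and writes them to a local CSV.
    for file_data in indexer.run():
        downloader.run(file_data=file_data)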
unstructured_ingest/v2/processes/connectors/chroma.py
@@ -41,9 +41,14 @@ class ChromaAccessConfig(AccessConfig):
     )


+SecretChromaAccessConfig = Secret[ChromaAccessConfig]
+
+
 class ChromaConnectionConfig(ConnectionConfig):
     collection_name: str = Field(description="The name of the Chroma collection to write into.")
-    access_config: Secret[ChromaAccessConfig]
+    access_config: SecretChromaAccessConfig = Field(
+        default=SecretChromaAccessConfig(secret_value=ChromaAccessConfig())
+    )
     path: Optional[str] = Field(
         default=None, description="Location where Chroma is persisted, if not connecting via http."
     )
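
With the default added above, access settings no longer have to be passed explicitly. A minimal sketch (collection name and path are placeholder values):

    from unstructured_ingest.v2.processes.connectors.chroma import ChromaConnectionConfig

    # access_config now defaults to an empty ChromaAccessConfig wrapped in Secret,
    # so a locally persisted Chroma destination only needs the collection details.
    connection_config = ChromaConnectionConfig(
        collection_name="my-collection",
        path="/tmp/chroma",
    )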
unstructured_ingest/v2/processes/connectors/elasticsearch.py
@@ -104,7 +104,7 @@ class ElasticsearchConnectionConfig(ConnectionConfig):
         elif access_config.es_api_key:
             client_input_kwargs["api_key"] = access_config.es_api_key
         client_input = ElasticsearchClientInput(**client_input_kwargs)
-        logger.debug(f"Elasticsearch client inputs mapped to: {client_input.dict()}")
+        logger.debug(f"elasticsearch client inputs mapped to: {client_input.dict()}")
         client_kwargs = client_input.dict()
         client_kwargs["basic_auth"] = (
             client_input.basic_auth.get_secret_value() if client_input.basic_auth else None
unstructured_ingest/v2/processes/connectors/fsspec/box.py
@@ -47,7 +47,7 @@ class BoxConnectionConfig(FsspecConnectionConfig):
     connector_type: str = Field(default=CONNECTOR_TYPE, init=False)

     def get_access_config(self) -> dict[str, Any]:
-        # Return access_kwargs with oauth. The oauth object can not be stored directly in the config
+        # Return access_kwargs with oauth. The oauth object cannot be stored directly in the config
        # because it is not serializable.
        from boxsdk import JWTAuth

unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py
@@ -317,9 +317,9 @@ class FsspecUploader(Uploader):
         path_str = str(path.resolve())
         upload_path = self.get_upload_path(file_data=file_data)
         if self.fs.exists(path=str(upload_path)) and not self.upload_config.overwrite:
-            logger.debug(f"Skipping upload of {path} to {upload_path}, file already exists")
+            logger.debug(f"skipping upload of {path} to {upload_path}, file already exists")
             return
-        logger.debug(f"Writing local file {path_str} to {upload_path}")
+        logger.debug(f"writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))

     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
@@ -328,7 +328,7 @@ class FsspecUploader(Uploader):
         # Odd that fsspec doesn't run exists() as async even when client support async
         already_exists = self.fs.exists(path=str(upload_path))
         if already_exists and not self.upload_config.overwrite:
-            logger.debug(f"Skipping upload of {path} to {upload_path}, file already exists")
+            logger.debug(f"skipping upload of {path} to {upload_path}, file already exists")
             return
-        logger.debug(f"Writing local file {path_str} to {upload_path}")
+        logger.debug(f"writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))
unstructured_ingest/v2/processes/connectors/google_drive.py
@@ -28,8 +28,7 @@ from unstructured_ingest.v2.interfaces import (
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
-
-from .utils import conform_string_to_dict
+from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict

 CONNECTOR_TYPE = "google_drive"

@@ -200,7 +199,7 @@ class GoogleDriveIndexer(Indexer):
         if extensions:
             ext_filter = " or ".join([f"fileExtension = '{e}'" for e in extensions])
             q = f"{q} and ({ext_filter} or mimeType = 'application/vnd.google-apps.folder')"
-        logger.debug(f"Query used when indexing: {q}")
+        logger.debug(f"query used when indexing: {q}")
         logger.debug("response fields limited to: {}".format(", ".join(self.fields)))
         done = False
         page_token = None
unstructured_ingest/v2/processes/connectors/local.py
@@ -180,14 +180,15 @@ class LocalUploader(Uploader):

     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         if source_identifiers := file_data.source_identifiers:
-            identifiers = source_identifiers
             rel_path = (
-                identifiers.relative_path[1:]
-                if identifiers.relative_path.startswith("/")
-                else identifiers.relative_path
+                source_identifiers.relative_path[1:]
+                if source_identifiers.relative_path.startswith("/")
+                else source_identifiers.relative_path
             )
             new_path = self.upload_config.output_path / Path(rel_path)
-            final_path = str(new_path).replace(identifiers.filename, f"{identifiers.filename}.json")
+            final_path = str(new_path).replace(
+                source_identifiers.filename, f"{source_identifiers.filename}.json"
+            )
         else:
             final_path = self.upload_config.output_path / Path(f"{file_data.identifier}.json")
         Path(final_path).parent.mkdir(parents=True, exist_ok=True)
unstructured_ingest/v2/processes/connectors/milvus.py
@@ -71,7 +71,7 @@ class MilvusUploadStagerConfig(UploadStagerConfig):
     fields_to_include: Optional[list[str]] = None
     """If set - list of fields to include in the output.
     Unspecified fields are removed from the elements.
-    This action takse place after metadata flattening.
+    This action takes place after metadata flattening.
     Missing fields will cause stager to throw KeyError."""

     flatten_metadata: bool = True
unstructured_ingest/v2/processes/connectors/onedrive.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import json
 from dataclasses import dataclass
 from pathlib import Path
@@ -103,7 +105,7 @@ class OnedriveIndexer(Indexer):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise SourceConnectionError(f"failed to validate connection: {e}")

-    def list_objects(self, folder, recursive) -> list["DriveItem"]:
+    def list_objects(self, folder: DriveItem, recursive: bool) -> list["DriveItem"]:
         drive_items = folder.children.get().execute_query()
         files = [d for d in drive_items if d.is_file]
         if not recursive:
@@ -139,12 +141,12 @@ class OnedriveIndexer(Indexer):
         server_path = file_path + "/" + filename
         rel_path = server_path.replace(self.index_config.path, "").lstrip("/")
         date_modified_dt = (
-            parser.parse(drive_item.last_modified_datetime)
+            parser.parse(str(drive_item.last_modified_datetime))
             if drive_item.last_modified_datetime
             else None
         )
         date_created_at = (
-            parser.parse(drive_item.created_datetime) if drive_item.created_datetime else None
+            parser.parse(str(drive_item.created_datetime)) if drive_item.created_datetime else None
         )
         return FileData(
             identifier=drive_item.id,
@@ -156,7 +158,7 @@
                 url=drive_item.parent_reference.path + "/" + drive_item.name,
                 version=drive_item.etag,
                 date_modified=str(date_modified_dt.timestamp()) if date_modified_dt else None,
-                date_created=str(date_created_at.timestamp()) if date_modified_dt else None,
+                date_created=str(date_created_at.timestamp()) if date_created_at else None,
                 date_processed=str(time()),
                 record_locator={
                     "user_pname": self.connection_config.user_pname,
@@ -211,9 +213,9 @@ class OnedriveDownloader(Downloader):
         fsize = file.get_property("size", 0)
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
-        logger.info(f"Downloading {file_data.source_identifiers.fullpath} to {download_path}")
+        logger.info(f"downloading {file_data.source_identifiers.fullpath} to {download_path}")
         if fsize > MAX_MB_SIZE:
-            logger.info(f"Downloading file with size: {fsize} bytes in chunks")
+            logger.info(f"downloading file with size: {fsize} bytes in chunks")
             with download_path.open(mode="wb") as f:
                 file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query()
         else:
unstructured_ingest/v2/processes/connectors/opensearch.py
@@ -101,7 +101,7 @@ class OpenSearchConnectionConfig(ConnectionConfig):
         if self.username and access_config.password:
             client_input_kwargs["http_auth"] = (self.username, access_config.password)
         client_input = OpenSearchClientInput(**client_input_kwargs)
-        logger.debug(f"OpenSearch client inputs mapped to: {client_input.dict()}")
+        logger.debug(f"opensearch client inputs mapped to: {client_input.dict()}")
         client_kwargs = client_input.dict()
         if client_input.http_auth is not None:
             client_kwargs["http_auth"] = client_input.http_auth.get_secret_value()
unstructured_ingest/v2/processes/connectors/pinecone.py
@@ -27,6 +27,7 @@ if TYPE_CHECKING:

 CONNECTOR_TYPE = "pinecone"
 MAX_PAYLOAD_SIZE = 2 * 1024 * 1024  # 2MB
+MAX_POOL_THREADS = 100


 class PineconeAccessConfig(AccessConfig):
@@ -45,7 +46,7 @@ class PineconeConnectionConfig(ConnectionConfig):
     )

     @requires_dependencies(["pinecone"], extras="pinecone")
-    def get_index(self) -> "PineconeIndex":
+    def get_index(self, **index_kwargs) -> "PineconeIndex":
         from pinecone import Pinecone

         from unstructured_ingest import __version__ as unstructured_version
@@ -55,8 +56,8 @@ class PineconeConnectionConfig(ConnectionConfig):
             source_tag=f"unstructured_ingest=={unstructured_version}",
         )

-        index = pc.Index(self.index_name)
-        logger.debug(f"Connected to index: {pc.describe_index(self.index_name)}")
+        index = pc.Index(name=self.index_name, **index_kwargs)
+        logger.debug(f"connected to index: {pc.describe_index(self.index_name)}")
         return index


@@ -65,7 +66,13 @@ class PineconeUploadStagerConfig(UploadStagerConfig):


 class PineconeUploaderConfig(UploaderConfig):
-    batch_size: int = Field(default=100, description="Number of records per batch")
+    batch_size: Optional[int] = Field(
+        default=None,
+        description="Optional number of records per batch. Will otherwise limit by size.",
+    )
+    pool_threads: Optional[int] = Field(
+        default=1, description="Optional limit on number of threads to use for upload"
+    )


 ALLOWED_FIELDS = (
@@ -149,29 +156,44 @@ class PineconeUploader(Uploader):
             raise DestinationConnectionError(f"failed to validate connection: {e}")

     @requires_dependencies(["pinecone"], extras="pinecone")
-    def upsert_batch(self, batch):
+    def upsert_batches_async(self, elements_dict: list[dict]):
         from pinecone.exceptions import PineconeApiException

-        try:
-            index = self.connection_config.get_index()
-            response = index.upsert(batch)
-        except PineconeApiException as api_error:
-            raise DestinationConnectionError(f"http error: {api_error}") from api_error
-        logger.debug(f"results: {response}")
+        chunks = list(
+            generator_batching_wbytes(
+                iterable=elements_dict,
+                batch_size_limit_bytes=MAX_PAYLOAD_SIZE - 100,
+                max_batch_size=self.upload_config.batch_size,
+            )
+        )
+        logger.info(f"split doc with {len(elements_dict)} elements into {len(chunks)} batches")
+
+        max_pool_threads = min(len(chunks), MAX_POOL_THREADS)
+        if self.upload_config.pool_threads:
+            pool_threads = min(self.upload_config.pool_threads, max_pool_threads)
+        else:
+            pool_threads = max_pool_threads
+        index = self.connection_config.get_index(pool_threads=pool_threads)
+        with index:
+            async_results = [index.upsert(vectors=chunk, async_req=True) for chunk in chunks]
+            # Wait for and retrieve responses (this raises in case of error)
+            try:
+                results = [async_result.get() for async_result in async_results]
+            except PineconeApiException as api_error:
+                raise DestinationConnectionError(f"http error: {api_error}") from api_error
+            logger.debug(f"results: {results}")

     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         with path.open("r") as file:
             elements_dict = json.load(file)
         logger.info(
-            f"writing document batches to destination"
+            f"writing a total of {len(elements_dict)} elements via"
+            f" document batches to destination"
             f" index named {self.connection_config.index_name}"
             f" with batch size {self.upload_config.batch_size}"
         )

-        for batch in generator_batching_wbytes(
-            elements_dict, MAX_PAYLOAD_SIZE - 100, self.upload_config.batch_size
-        ):
-            self.upsert_batch(batch=batch)
+        self.upsert_batches_async(elements_dict=elements_dict)


 pinecone_destination_entry = DestinationRegistryEntry(
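
The rewritten uploader splits the elements into size-bounded chunks and issues the upserts asynchronously against a thread pool capped by pool_threads and MAX_POOL_THREADS. For orientation, here is a simplified stand-in for the byte-limited batching that generator_batching_wbytes is invoked with above; this is an illustrative sketch, not the library's actual implementation.

    import json
    from typing import Generator, Iterable, Optional


    def batch_by_bytes(
        iterable: Iterable[dict],
        batch_size_limit_bytes: int,
        max_batch_size: Optional[int] = None,
    ) -> Generator[list[dict], None, None]:
        # Accumulate items until adding the next one would exceed the byte limit
        # (or the optional max item count), then yield the batch and start a new one.
        batch: list[dict] = []
        batch_bytes = 0
        for item in iterable:
            item_bytes = len(json.dumps(item).encode("utf-8"))
            too_big = batch and batch_bytes + item_bytes > batch_size_limit_bytes
            too_many = max_batch_size is not None and len(batch) >= max_batch_size
            if too_big or too_many:
                yield batch
                batch, batch_bytes = [], 0
            batch.append(item)
            batch_bytes += item_bytes
        if batch:
            yield batch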
unstructured_ingest/v2/processes/connectors/sharepoint.py
@@ -60,13 +60,16 @@ class SharepointAccessConfig(AccessConfig):


 class SharepointPermissionsConfig(BaseModel):
-    permissions_application_id: str = Field(description="Microsoft Graph API application id")
-    permissions_tenant: str = Field(
+    permissions_application_id: Optional[str] = Field(
+        default=None, description="Microsoft Graph API application id"
+    )
+    permissions_tenant: Optional[str] = Field(
+        default=None,
         description="url to get permissions data within tenant.",
         examples=["https://contoso.onmicrosoft.com"],
     )
-    permissions_client_cred: SecretStr = Field(
-        description="Microsoft Graph API application credentials"
+    permissions_client_cred: Optional[SecretStr] = Field(
+        default=None, description="Microsoft Graph API application credentials"
     )
     authority_url: Optional[SecretStr] = Field(
         repr=False,
@@ -139,7 +142,7 @@ class SharepointConnectionConfig(ConnectionConfig):

 class SharepointIndexerConfig(IndexerConfig):
     path: Optional[str] = Field(
-        defaul=None,
+        default=None,
         description="Path from which to start parsing files. If the connector is to \
            process all sites within the tenant this filter will be applied to \
            all sites document libraries.",
@@ -335,7 +338,8 @@ class SharepointIndexer(Indexer):
     @property
     def process_permissions(self) -> bool:
         return (
-            self.connection_config.permissions_config.permissions_tenant
+            self.connection_config.permissions_config is not None
+            and self.connection_config.permissions_config.permissions_tenant
             and self.connection_config.permissions_config.permissions_client_cred.get_secret_value()
             and self.connection_config.permissions_config.permissions_application_id
         )