unstructured-ingest 1.0.8__py3-none-any.whl → 1.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1 +1 @@
1
- __version__ = "1.0.8" # pragma: no cover
1
+ __version__ = "1.0.11" # pragma: no cover
@@ -153,7 +153,13 @@ class GoogleDriveIndexer(Indexer):
153
153
  """
154
154
  try:
155
155
  # A very minimal call: list 1 file from the drive.
156
- client.list(spaces="drive", pageSize=1, fields="files(id)").execute()
156
+ client.list(
157
+ supportsAllDrives=True,
158
+ includeItemsFromAllDrives=True,
159
+ spaces="drive",
160
+ pageSize=1,
161
+ fields="files(id)",
162
+ ).execute()
157
163
  except HttpError as e:
158
164
  error_content = e.content.decode() if hasattr(e, "content") else ""
159
165
  lower_error = error_content.lower()
@@ -183,6 +189,8 @@ class GoogleDriveIndexer(Indexer):
183
189
  page_token = None
184
190
  while True:
185
191
  response = files_client.list(
192
+ supportsAllDrives=True,
193
+ includeItemsFromAllDrives=True,
186
194
  spaces="drive",
187
195
  q=query,
188
196
  fields="nextPageToken, files(id, mimeType, fileExtension)",
@@ -251,6 +259,8 @@ class GoogleDriveIndexer(Indexer):
251
259
  else:
252
260
  # Non-recursive: check for at least one immediate non-folder child.
253
261
  response = client.list(
262
+ supportsAllDrives=True,
263
+ includeItemsFromAllDrives=True,
254
264
  spaces="drive",
255
265
  fields="files(id)",
256
266
  pageSize=1,
@@ -348,6 +358,8 @@ class GoogleDriveIndexer(Indexer):
348
358
  files_response = []
349
359
  while not done:
350
360
  response: dict = files_client.list(
361
+ supportsAllDrives=True,
362
+ includeItemsFromAllDrives=True,
351
363
  spaces="drive",
352
364
  fields=fields_input,
353
365
  corpora="user",
@@ -381,7 +393,9 @@ class GoogleDriveIndexer(Indexer):
381
393
  return files_response
382
394
 
383
395
  def get_root_info(self, files_client, object_id: str) -> dict:
384
- return files_client.get(fileId=object_id, fields=",".join(self.fields)).execute()
396
+ return files_client.get(
397
+ supportsAllDrives=True, fileId=object_id, fields=",".join(self.fields)
398
+ ).execute()
385
399
 
386
400
  def get_files(
387
401
  self,
@@ -172,7 +172,7 @@ class IbmWatsonxUploaderConfig(UploaderConfig):
172
172
  namespace: str = Field(description="Namespace name")
173
173
  table: str = Field(description="Table name")
174
174
  max_retries: int = Field(
175
- default=5, description="Maximum number of retries to upload data", ge=2, le=10
175
+ default=5, description="Maximum number of retries to upload data", ge=2, le=500
176
176
  )
177
177
  record_id_key: str = Field(
178
178
  default=RECORD_ID_LABEL,
@@ -1,6 +1,7 @@
1
1
  import json
2
2
  import re
3
3
  from dataclasses import dataclass, field
4
+ from pathlib import Path
4
5
  from typing import TYPE_CHECKING, Any, Literal, Optional
5
6
 
6
7
  from pydantic import Field, Secret
@@ -18,11 +19,14 @@ from unstructured_ingest.interfaces import (
18
19
  )
19
20
  from unstructured_ingest.logger import logger
20
21
  from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
22
+ from unstructured_ingest.utils import ndjson
21
23
  from unstructured_ingest.utils.constants import RECORD_ID_LABEL
22
24
  from unstructured_ingest.utils.data_prep import (
23
25
  flatten_dict,
24
26
  generator_batching_wbytes,
25
27
  get_enhanced_element_id,
28
+ get_json_data,
29
+ write_data,
26
30
  )
27
31
  from unstructured_ingest.utils.dep_check import requires_dependencies
28
32
 
@@ -162,6 +166,28 @@ class PineconeUploadStager(UploadStager):
162
166
  "metadata": metadata,
163
167
  }
164
168
 
169
+ def stream_update(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
170
+ with input_file.open() as in_f:
171
+ reader = ndjson.reader(in_f)
172
+ with output_file.open("w") as out_f:
173
+ writer = ndjson.writer(out_f)
174
+ for element in reader:
175
+ if "embeddings" not in element:
176
+ continue
177
+ conformed_element = self.conform_dict(element_dict=element, file_data=file_data)
178
+ writer.write(row=conformed_element)
179
+ writer.f.flush()
180
+
181
+ def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
182
+ elements_contents = get_json_data(path=input_file)
183
+
184
+ conformed_elements = [
185
+ self.conform_dict(element_dict=element, file_data=file_data)
186
+ for element in elements_contents
187
+ if "embeddings" in element
188
+ ]
189
+ write_data(path=output_file, data=conformed_elements)
190
+
165
191
 
166
192
  @dataclass
167
193
  class PineconeUploader(VectorDBUploader):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unstructured_ingest
3
- Version: 1.0.8
3
+ Version: 1.0.11
4
4
  Summary: Local ETL data pipeline to get data RAG ready
5
5
  Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
6
  License-Expression: Apache-2.0
@@ -1,5 +1,5 @@
1
1
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
2
- unstructured_ingest/__version__.py,sha256=Ca4MzLfEjKrGXx21Kyt3Ve65pu59qVvEIU_io-qxQ9o,42
2
+ unstructured_ingest/__version__.py,sha256=T2mg8xL1j5lQc6Bl6xC62EjS3ZMzCbWBXW81Kgto5vs,43
3
3
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
4
4
  unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
5
5
  unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
@@ -72,7 +72,7 @@ unstructured_ingest/processes/connectors/delta_table.py,sha256=2DFox_Vzoopt_D3Jy
72
72
  unstructured_ingest/processes/connectors/discord.py,sha256=6yEJ_agfKUqsV43wFsbMkcd8lcLJC0uqbo4izjdZ3rU,5294
73
73
  unstructured_ingest/processes/connectors/github.py,sha256=smHCz6jOH1p_hW2S25bYunBBj_pYjz8HTw6wkzaJz_A,7765
74
74
  unstructured_ingest/processes/connectors/gitlab.py,sha256=6h1CdqznJmzeWxGfXrFLdNdT23PExGnUMMX7usK_4Kk,10013
75
- unstructured_ingest/processes/connectors/google_drive.py,sha256=YMuobb9AZq-0pp6rbYyXNizs3jA6z3nQjO9cK_nncXQ,21936
75
+ unstructured_ingest/processes/connectors/google_drive.py,sha256=-bpOd2kIffghRBCmhEXQ0TPRZAKoE6hFtzA7hfw6QN0,22421
76
76
  unstructured_ingest/processes/connectors/jira.py,sha256=eG8yTn8ZVEz7rBJ-ha8i_d9hEh6VALN6QJT_vbYvbL0,17142
77
77
  unstructured_ingest/processes/connectors/kdbai.py,sha256=XhxYpKSAoFPBsDQWwNuLX03DCxOVr7yquj9VYM55Rtc,5174
78
78
  unstructured_ingest/processes/connectors/local.py,sha256=LluTLKv4g7FbJb4A6vuSxI9VhzKZuuQUpDS-cVNAQ2g,7426
@@ -81,7 +81,7 @@ unstructured_ingest/processes/connectors/mongodb.py,sha256=1g_5bfbS6lah3nsOXqLAa
81
81
  unstructured_ingest/processes/connectors/neo4j.py,sha256=ztxvI9KY8RF5kYUuMGSzzN5mz7Fu_4Ai9P7dqCpJLc0,20267
82
82
  unstructured_ingest/processes/connectors/onedrive.py,sha256=VBkKlbJgR7uKlKTnjNybAw6ZawLKflDPpy2uVvgWYWw,19296
83
83
  unstructured_ingest/processes/connectors/outlook.py,sha256=zHM5frO7CqQG0-KcTyX49aZeSlsvVrl8kh_lR_ESgQw,9275
84
- unstructured_ingest/processes/connectors/pinecone.py,sha256=BdO1PS_Y6FOeL-7uPl-Eh6ij1wHOwMkopOzKQGQ9Ac0,13979
84
+ unstructured_ingest/processes/connectors/pinecone.py,sha256=pSREUNsQqel6q1EFZsFWelg-uZgGubQY5m_6nVnBFKs,15090
85
85
  unstructured_ingest/processes/connectors/redisdb.py,sha256=YzvSlfHs83XWsWMaIC3bV5enKfxejMQ9BQ8CtXfnJ5o,6923
86
86
  unstructured_ingest/processes/connectors/salesforce.py,sha256=OaKEWCqZrirHqFJ650K5jSPwYlWefPOapas8Y-4D9oc,11661
87
87
  unstructured_ingest/processes/connectors/sharepoint.py,sha256=PowaqMzWr-VCW1rnwcAeRhHyE55kJ9J9FCVlrmtzN0E,4827
@@ -115,7 +115,7 @@ unstructured_ingest/processes/connectors/fsspec/s3.py,sha256=2ZV6b2E2pIsf_ab1Lty
115
115
  unstructured_ingest/processes/connectors/fsspec/sftp.py,sha256=pR_a2SgLjt8ffNkariHrPB1E0HVSTj5h3pt7KxTU3TI,6371
116
116
  unstructured_ingest/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
117
117
  unstructured_ingest/processes/connectors/ibm_watsonx/__init__.py,sha256=kf0UpgdAY2KK1R1FbAB6GEBBAIOeYQ8cZIr3bp660qM,374
118
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py,sha256=SpZIqjreXpLTpZfezhG6xkZ_h7w-QWmdjXDBG6mlddQ,11729
118
+ unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py,sha256=cQvtLVKCTiqFeImz2OwvgdFL7YnMwLLT1_B_ueHy8Qo,11730
119
119
  unstructured_ingest/processes/connectors/kafka/__init__.py,sha256=pFN2cWwAStiGTAsQ616GIWKi_hDv0s74ZvNqhJEp1Pc,751
120
120
  unstructured_ingest/processes/connectors/kafka/cloud.py,sha256=Ki6iOLoZ86tYWdnLnMWYvb2hUCneKqo4mTJcfXh7YoQ,3432
121
121
  unstructured_ingest/processes/connectors/kafka/kafka.py,sha256=7NMvWijfoliyAgnmz8TM8oJt5x7RDzC-ABPdYAm7J3w,10306
@@ -231,8 +231,8 @@ unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01q
231
231
  unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
232
232
  unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
233
233
  unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
234
- unstructured_ingest-1.0.8.dist-info/METADATA,sha256=IzN7b_dpadQBKpp59jO7VfWzgQfJrF8ykGLo7epNMeY,8719
235
- unstructured_ingest-1.0.8.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
236
- unstructured_ingest-1.0.8.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
237
- unstructured_ingest-1.0.8.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
238
- unstructured_ingest-1.0.8.dist-info/RECORD,,
234
+ unstructured_ingest-1.0.11.dist-info/METADATA,sha256=qD4WgfLZBApr6D_Zwg_9_1QBXjmKcQgxt-lpIja1liE,8720
235
+ unstructured_ingest-1.0.11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
236
+ unstructured_ingest-1.0.11.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
237
+ unstructured_ingest-1.0.11.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
238
+ unstructured_ingest-1.0.11.dist-info/RECORD,,