unstructured-ingest 1.0.4__py3-none-any.whl → 1.0.6.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Further details were linked from the original page; the link is not preserved in this text.

@@ -1 +1 @@
1
- __version__ = "1.0.4" # pragma: no cover
1
+ __version__ = "1.0.6-dev0" # pragma: no cover
@@ -1,6 +1,6 @@
1
1
  import json
2
2
  from pathlib import Path
3
- from typing import Any, Optional, Union
3
+ from typing import Any, Optional
4
4
  from uuid import NAMESPACE_DNS, uuid5
5
5
 
6
6
  from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
@@ -29,7 +29,7 @@ class FileDataSourceMetadata(BaseModel):
29
29
  date_created: Optional[str] = None
30
30
  date_modified: Optional[str] = None
31
31
  date_processed: Optional[str] = None
32
- permissions_data: Union[list[dict[str, Any]], dict[str, Any], None] = None
32
+ permissions_data: Optional[list[dict[str, Any]]] = None
33
33
  filesize_bytes: Optional[int] = None
34
34
 
35
35
 
@@ -352,6 +352,7 @@ class ConfluenceDownloader(Downloader):
352
352
  def _get_permissions_for_space(self, space_id: int) -> Optional[List[dict]]:
353
353
  if space_id in self._permissions_cache:
354
354
  self._permissions_cache.move_to_end(space_id) # mark recent use
355
+ logger.debug(f"Retrieved cached permissions for space {space_id}")
355
356
  return self._permissions_cache[space_id]
356
357
  else:
357
358
  with self.connection_config.get_client() as client:
@@ -371,22 +372,27 @@ class ConfluenceDownloader(Downloader):
371
372
  self._permissions_cache.popitem(last=False) # LRU/FIFO eviction
372
373
  self._permissions_cache[space_id] = space_permissions
373
374
 
375
+ logger.debug(f"Retrieved permissions for space {space_id}")
374
376
  return space_permissions
375
377
  except Exception as e:
376
378
  logger.debug(f"Could not retrieve permissions for space {space_id}: {e}")
377
379
  return None
378
380
 
379
- def _parse_permissions_for_doc(self, doc_id: str, space_permissions: list) -> Optional[dict]:
381
+ def _parse_permissions_for_doc(
382
+ self, doc_id: str, space_permissions: list
383
+ ) -> Optional[list[dict]]:
380
384
  with self.connection_config.get_client() as client:
381
385
  try:
382
386
  doc_permissions = client.get_all_restrictions_for_content(content_id=doc_id)
383
387
  parsed_permissions_dict = self.parse_permissions(doc_permissions, space_permissions)
388
+ parsed_permissions_dict = [{k: v} for k, v in parsed_permissions_dict.items()]
384
389
 
385
390
  except Exception as e:
386
391
  # skip writing any permission metadata
387
392
  logger.debug(f"Could not retrieve permissions for doc {doc_id}: {e}")
388
393
  return None
389
394
 
395
+ logger.debug(f"normalized permissions generated: {parsed_permissions_dict}")
390
396
  return parsed_permissions_dict
391
397
 
392
398
  def run(self, file_data: FileData, **kwargs) -> download_responses:
@@ -54,7 +54,8 @@ class GoogleDriveAccessConfig(AccessConfig):
54
54
  default=None, description="Credentials values to use for authentication"
55
55
  )
56
56
  service_account_key_path: Optional[Path] = Field(
57
- default=None, description="File path to credentials values to use for authentication"
57
+ default=None,
58
+ description="File path to credentials values to use for authentication",
58
59
  )
59
60
 
60
61
  def model_post_init(self, __context: Any) -> None:
@@ -111,10 +112,9 @@ class GoogleDriveIndexerConfig(IndexerConfig):
111
112
  extensions: Optional[list[str]] = None
112
113
  recursive: bool = False
113
114
 
114
- def __post_init__(self):
115
- # Strip leading period of extension
115
+ def model_post_init(self, __context: Any) -> None:
116
116
  if self.extensions is not None:
117
- self.extensions = [e[1:] if e.startswith(".") else e for e in self.extensions]
117
+ self.extensions = [e.lstrip(".") for e in self.extensions]
118
118
 
119
119
 
120
120
  @dataclass
@@ -275,7 +275,8 @@ class GoogleDriveIndexer(Indexer):
275
275
 
276
276
  except Exception as e:
277
277
  logger.error(
278
- "Failed to validate Google Drive connection during precheck", exc_info=True
278
+ "Failed to validate Google Drive connection during precheck",
279
+ exc_info=True,
279
280
  )
280
281
  raise SourceConnectionError(f"Precheck failed: {e}")
281
282
 
@@ -284,17 +285,17 @@ class GoogleDriveIndexer(Indexer):
284
285
  return record.get("mimeType") == "application/vnd.google-apps.folder"
285
286
 
286
287
  @staticmethod
287
- def map_file_data(f: dict) -> FileData:
288
- file_id = f["id"]
289
- filename = f.pop("name")
290
- url = f.pop("webContentLink", None)
291
- version = f.pop("version", None)
292
- permissions = f.pop("permissions", None)
293
- date_created_str = f.pop("createdTime", None)
288
+ def map_file_data(root_info: dict) -> FileData:
289
+ file_id = root_info["id"]
290
+ filename = root_info.pop("name")
291
+ url = root_info.pop("webContentLink", None)
292
+ version = root_info.pop("version", None)
293
+ permissions = root_info.pop("permissions", None)
294
+ date_created_str = root_info.pop("createdTime", None)
294
295
  date_created_dt = parser.parse(date_created_str) if date_created_str else None
295
- date_modified_str = f.pop("modifiedTime", None)
296
- parent_path = f.pop("parent_path", None)
297
- parent_root_path = f.pop("parent_root_path", None)
296
+ date_modified_str = root_info.pop("modifiedTime", None)
297
+ parent_path = root_info.pop("parent_path", None)
298
+ parent_root_path = root_info.pop("parent_root_path", None)
298
299
  date_modified_dt = parser.parse(date_modified_str) if date_modified_str else None
299
300
  if (
300
301
  parent_path
@@ -323,7 +324,7 @@ class GoogleDriveIndexer(Indexer):
323
324
  "file_id": file_id,
324
325
  },
325
326
  ),
326
- additional_metadata=f,
327
+ additional_metadata=root_info,
327
328
  )
328
329
 
329
330
  def get_paginated_results(
@@ -404,13 +405,14 @@ class GoogleDriveIndexer(Indexer):
404
405
  data = []
405
406
  for f in file_contents:
406
407
  f["permissions"] = self.extract_permissions(f.get("permissions"))
407
- data.append(self.map_file_data(f=f))
408
+ data.append(self.map_file_data(root_info=f))
408
409
  for d in data:
409
410
  d.metadata.record_locator["drive_id"]: object_id
410
411
  return data
411
412
 
412
- def extract_permissions(self, permissions: list[dict]) -> dict:
413
+ def extract_permissions(self, permissions: Optional[list[dict]]) -> list[dict]:
413
414
  if not permissions:
415
+ logger.debug("no permissions found")
414
416
  return {}
415
417
 
416
418
  # https://developers.google.com/workspace/drive/api/guides/ref-roles
@@ -442,7 +444,8 @@ class GoogleDriveIndexer(Indexer):
442
444
  for key in role_dict:
443
445
  role_dict[key] = sorted(role_dict[key])
444
446
 
445
- return normalized_permissions
447
+ logger.debug(f"normalized permissions generated: {normalized_permissions}")
448
+ return [{k: v} for k, v in normalized_permissions.items()]
446
449
 
447
450
  def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
448
451
  with self.connection_config.get_client() as client:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unstructured_ingest
3
- Version: 1.0.4
3
+ Version: 1.0.6.dev0
4
4
  Summary: Local ETL data pipeline to get data RAG ready
5
5
  Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
6
  License-Expression: Apache-2.0
@@ -1,5 +1,5 @@
1
1
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
2
- unstructured_ingest/__version__.py,sha256=MizK8W2VY6aXUudG1jVogTj7GJ2uwduw5iryFPwi0tM,42
2
+ unstructured_ingest/__version__.py,sha256=f9THPBYTYgzAGHqOPz3K_-VIVaQAVKMJgCpBy7w0j_k,47
3
3
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
4
4
  unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
5
5
  unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
@@ -19,7 +19,7 @@ unstructured_ingest/cli/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5N
19
19
  unstructured_ingest/cli/utils/click.py,sha256=1_eJgrwS2DFBl1jZPLsj1vgVgR7agFBIEBe4A_n7mH4,7827
20
20
  unstructured_ingest/cli/utils/model_conversion.py,sha256=hMjAfOVvO1RXTDsw26mmersdncvddkb_rP9JTEgVVCw,7649
21
21
  unstructured_ingest/data_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
- unstructured_ingest/data_types/file_data.py,sha256=E-09hkI4ms4yj-g_aQPIrnm0kbiZLwukCnbwp6OpobQ,3859
22
+ unstructured_ingest/data_types/file_data.py,sha256=J0RQa7YXhhxiLVzhPbF5Hl2nzSpxLFK9vrP6RTBWlSg,3833
23
23
  unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
24
  unstructured_ingest/embed/azure_openai.py,sha256=_-I-nwd-wdCiKkSdYBL4UKrTZ2UPWsM_0T69fcObs_I,1707
25
25
  unstructured_ingest/embed/bedrock.py,sha256=t58V_QQjWPO62CTuP0aLFMDisPeXpxG2xSFGUhN-JvI,7726
@@ -65,13 +65,13 @@ unstructured_ingest/processes/connectors/airtable.py,sha256=smx5qBSUKwM8V6Xcc7ik
65
65
  unstructured_ingest/processes/connectors/astradb.py,sha256=Ob9wQgDxa6BXDPZBOqooNKQgvjIZcMwIe4fW3VlI7h8,18929
66
66
  unstructured_ingest/processes/connectors/azure_ai_search.py,sha256=szhSRXzUHk0DE2hGFfjGc_jNFzlUwiRlCtIkuu7tmnk,11524
67
67
  unstructured_ingest/processes/connectors/chroma.py,sha256=q5_Fu4xb6_W_NyrPxVa3-jVwZLqVdlBNlR4dFvbd7l0,7235
68
- unstructured_ingest/processes/connectors/confluence.py,sha256=7uRgmpX3NcVzA2V7VcngzjMQ69pS0J2wu6cbMp7AFA0,20739
68
+ unstructured_ingest/processes/connectors/confluence.py,sha256=1oT4A83jSOWR8u8kldHImOBqSLxctdlsR-AZpzJfO9w,21098
69
69
  unstructured_ingest/processes/connectors/couchbase.py,sha256=KCHoYDNya9B05NIB5D78zXoizFyfpJRepcYBe1nLSOs,12298
70
70
  unstructured_ingest/processes/connectors/delta_table.py,sha256=2DFox_Vzoopt_D3Jy3rCjrrTGMutG2INIrwCeoIohRY,7340
71
71
  unstructured_ingest/processes/connectors/discord.py,sha256=6yEJ_agfKUqsV43wFsbMkcd8lcLJC0uqbo4izjdZ3rU,5294
72
72
  unstructured_ingest/processes/connectors/github.py,sha256=smHCz6jOH1p_hW2S25bYunBBj_pYjz8HTw6wkzaJz_A,7765
73
73
  unstructured_ingest/processes/connectors/gitlab.py,sha256=6h1CdqznJmzeWxGfXrFLdNdT23PExGnUMMX7usK_4Kk,10013
74
- unstructured_ingest/processes/connectors/google_drive.py,sha256=mcplAPbQ_A_MIsIXWc7K0YtEXMIMmluefsrzddJQNFw,21674
74
+ unstructured_ingest/processes/connectors/google_drive.py,sha256=YMuobb9AZq-0pp6rbYyXNizs3jA6z3nQjO9cK_nncXQ,21936
75
75
  unstructured_ingest/processes/connectors/jira.py,sha256=eG8yTn8ZVEz7rBJ-ha8i_d9hEh6VALN6QJT_vbYvbL0,17142
76
76
  unstructured_ingest/processes/connectors/kdbai.py,sha256=XhxYpKSAoFPBsDQWwNuLX03DCxOVr7yquj9VYM55Rtc,5174
77
77
  unstructured_ingest/processes/connectors/local.py,sha256=LluTLKv4g7FbJb4A6vuSxI9VhzKZuuQUpDS-cVNAQ2g,7426
@@ -230,8 +230,8 @@ unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01q
230
230
  unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
231
231
  unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
232
232
  unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
233
- unstructured_ingest-1.0.4.dist-info/METADATA,sha256=ZrV3WL4OOzjU53IKTL59o3dr5UIDRrSGewp-tFGfSF8,8719
234
- unstructured_ingest-1.0.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
235
- unstructured_ingest-1.0.4.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
236
- unstructured_ingest-1.0.4.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
237
- unstructured_ingest-1.0.4.dist-info/RECORD,,
233
+ unstructured_ingest-1.0.6.dev0.dist-info/METADATA,sha256=2A8V0IxykBKTDpalgDOmkRw98MoN6whUPI0DwHkBuBc,8724
234
+ unstructured_ingest-1.0.6.dev0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
235
+ unstructured_ingest-1.0.6.dev0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
236
+ unstructured_ingest-1.0.6.dev0.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
237
+ unstructured_ingest-1.0.6.dev0.dist-info/RECORD,,