unstructured-ingest 1.0.5__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1 +1 @@
1
- __version__ = "1.0.5" # pragma: no cover
1
+ __version__ = "1.0.7" # pragma: no cover
@@ -1,6 +1,6 @@
1
1
  import json
2
2
  from pathlib import Path
3
- from typing import Any, Optional, Union
3
+ from typing import Any, Optional
4
4
  from uuid import NAMESPACE_DNS, uuid5
5
5
 
6
6
  from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
@@ -29,31 +29,9 @@ class FileDataSourceMetadata(BaseModel):
29
29
  date_created: Optional[str] = None
30
30
  date_modified: Optional[str] = None
31
31
  date_processed: Optional[str] = None
32
- permissions_data: Union[list[dict[str, Any]], dict[str, Any], None] = None
32
+ permissions_data: Optional[list[dict[str, Any]]] = None
33
33
  filesize_bytes: Optional[int] = None
34
34
 
35
- @field_validator("permissions_data", mode="before")
36
- @classmethod
37
- def coerce_permissions_data(cls, v: Any) -> Any:
38
- if isinstance(v, dict):
39
- # Temporarily convert dict to list for validation
40
- return [v]
41
- return v
42
-
43
- @field_validator("permissions_data", mode="after")
44
- @classmethod
45
- def restore_dict_permissions_data(
46
- cls, v: Optional[list[dict[str, Any]]]
47
- ) -> Union[list[dict[str, Any]], dict[str, Any], None]:
48
- if (
49
- isinstance(v, list)
50
- and len(v) == 1
51
- and isinstance(v[0], dict)
52
- and any(isinstance(val, dict) for val in v[0].values())
53
- ):
54
- return v[0]
55
- return v
56
-
57
35
 
58
36
  class FileData(BaseModel):
59
37
  identifier: str
@@ -378,11 +378,14 @@ class ConfluenceDownloader(Downloader):
378
378
  logger.debug(f"Could not retrieve permissions for space {space_id}: {e}")
379
379
  return None
380
380
 
381
- def _parse_permissions_for_doc(self, doc_id: str, space_permissions: list) -> Optional[dict]:
381
+ def _parse_permissions_for_doc(
382
+ self, doc_id: str, space_permissions: list
383
+ ) -> Optional[list[dict]]:
382
384
  with self.connection_config.get_client() as client:
383
385
  try:
384
386
  doc_permissions = client.get_all_restrictions_for_content(content_id=doc_id)
385
387
  parsed_permissions_dict = self.parse_permissions(doc_permissions, space_permissions)
388
+ parsed_permissions_dict = [{k: v} for k, v in parsed_permissions_dict.items()]
386
389
 
387
390
  except Exception as e:
388
391
  # skip writing any permission metadata
@@ -54,7 +54,8 @@ class GoogleDriveAccessConfig(AccessConfig):
54
54
  default=None, description="Credentials values to use for authentication"
55
55
  )
56
56
  service_account_key_path: Optional[Path] = Field(
57
- default=None, description="File path to credentials values to use for authentication"
57
+ default=None,
58
+ description="File path to credentials values to use for authentication",
58
59
  )
59
60
 
60
61
  def model_post_init(self, __context: Any) -> None:
@@ -111,10 +112,9 @@ class GoogleDriveIndexerConfig(IndexerConfig):
111
112
  extensions: Optional[list[str]] = None
112
113
  recursive: bool = False
113
114
 
114
- def __post_init__(self):
115
- # Strip leading period of extension
115
+ def model_post_init(self, __context: Any) -> None:
116
116
  if self.extensions is not None:
117
- self.extensions = [e[1:] if e.startswith(".") else e for e in self.extensions]
117
+ self.extensions = [e.lstrip(".") for e in self.extensions]
118
118
 
119
119
 
120
120
  @dataclass
@@ -275,7 +275,8 @@ class GoogleDriveIndexer(Indexer):
275
275
 
276
276
  except Exception as e:
277
277
  logger.error(
278
- "Failed to validate Google Drive connection during precheck", exc_info=True
278
+ "Failed to validate Google Drive connection during precheck",
279
+ exc_info=True,
279
280
  )
280
281
  raise SourceConnectionError(f"Precheck failed: {e}")
281
282
 
@@ -284,17 +285,17 @@ class GoogleDriveIndexer(Indexer):
284
285
  return record.get("mimeType") == "application/vnd.google-apps.folder"
285
286
 
286
287
  @staticmethod
287
- def map_file_data(f: dict) -> FileData:
288
- file_id = f["id"]
289
- filename = f.pop("name")
290
- url = f.pop("webContentLink", None)
291
- version = f.pop("version", None)
292
- permissions = f.pop("permissions", None)
293
- date_created_str = f.pop("createdTime", None)
288
+ def map_file_data(root_info: dict) -> FileData:
289
+ file_id = root_info["id"]
290
+ filename = root_info.pop("name")
291
+ url = root_info.pop("webContentLink", None)
292
+ version = root_info.pop("version", None)
293
+ permissions = root_info.pop("permissions", None)
294
+ date_created_str = root_info.pop("createdTime", None)
294
295
  date_created_dt = parser.parse(date_created_str) if date_created_str else None
295
- date_modified_str = f.pop("modifiedTime", None)
296
- parent_path = f.pop("parent_path", None)
297
- parent_root_path = f.pop("parent_root_path", None)
296
+ date_modified_str = root_info.pop("modifiedTime", None)
297
+ parent_path = root_info.pop("parent_path", None)
298
+ parent_root_path = root_info.pop("parent_root_path", None)
298
299
  date_modified_dt = parser.parse(date_modified_str) if date_modified_str else None
299
300
  if (
300
301
  parent_path
@@ -323,7 +324,7 @@ class GoogleDriveIndexer(Indexer):
323
324
  "file_id": file_id,
324
325
  },
325
326
  ),
326
- additional_metadata=f,
327
+ additional_metadata=root_info,
327
328
  )
328
329
 
329
330
  def get_paginated_results(
@@ -404,12 +405,12 @@ class GoogleDriveIndexer(Indexer):
404
405
  data = []
405
406
  for f in file_contents:
406
407
  f["permissions"] = self.extract_permissions(f.get("permissions"))
407
- data.append(self.map_file_data(f=f))
408
+ data.append(self.map_file_data(root_info=f))
408
409
  for d in data:
409
410
  d.metadata.record_locator["drive_id"]: object_id
410
411
  return data
411
412
 
412
- def extract_permissions(self, permissions: Optional[list[dict]]) -> dict:
413
+ def extract_permissions(self, permissions: Optional[list[dict]]) -> list[dict]:
413
414
  if not permissions:
414
415
  logger.debug("no permissions found")
415
416
  return {}
@@ -444,7 +445,7 @@ class GoogleDriveIndexer(Indexer):
444
445
  role_dict[key] = sorted(role_dict[key])
445
446
 
446
447
  logger.debug(f"normalized permissions generated: {normalized_permissions}")
447
- return normalized_permissions
448
+ return [{k: v} for k, v in normalized_permissions.items()]
448
449
 
449
450
  def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
450
451
  with self.connection_config.get_client() as client:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unstructured_ingest
3
- Version: 1.0.5
3
+ Version: 1.0.7
4
4
  Summary: Local ETL data pipeline to get data RAG ready
5
5
  Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
6
  License-Expression: Apache-2.0
@@ -1,5 +1,5 @@
1
1
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
2
- unstructured_ingest/__version__.py,sha256=64jEAZw6WkJoKXj1K79a59MykYVOTTwLRMWIdobOAH0,42
2
+ unstructured_ingest/__version__.py,sha256=v9ynVBAWZqfN0IY9kPMhBdUR0cs_p-L4iuNS6ua_3cc,42
3
3
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
4
4
  unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
5
5
  unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
@@ -19,7 +19,7 @@ unstructured_ingest/cli/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5N
19
19
  unstructured_ingest/cli/utils/click.py,sha256=1_eJgrwS2DFBl1jZPLsj1vgVgR7agFBIEBe4A_n7mH4,7827
20
20
  unstructured_ingest/cli/utils/model_conversion.py,sha256=hMjAfOVvO1RXTDsw26mmersdncvddkb_rP9JTEgVVCw,7649
21
21
  unstructured_ingest/data_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
- unstructured_ingest/data_types/file_data.py,sha256=7JwwbcgVQdIwCKxrDLUYvJp1f-bzaiGQD8ETr-Ywph8,4571
22
+ unstructured_ingest/data_types/file_data.py,sha256=J0RQa7YXhhxiLVzhPbF5Hl2nzSpxLFK9vrP6RTBWlSg,3833
23
23
  unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
24
  unstructured_ingest/embed/azure_openai.py,sha256=_-I-nwd-wdCiKkSdYBL4UKrTZ2UPWsM_0T69fcObs_I,1707
25
25
  unstructured_ingest/embed/bedrock.py,sha256=t58V_QQjWPO62CTuP0aLFMDisPeXpxG2xSFGUhN-JvI,7726
@@ -65,13 +65,13 @@ unstructured_ingest/processes/connectors/airtable.py,sha256=smx5qBSUKwM8V6Xcc7ik
65
65
  unstructured_ingest/processes/connectors/astradb.py,sha256=Ob9wQgDxa6BXDPZBOqooNKQgvjIZcMwIe4fW3VlI7h8,18929
66
66
  unstructured_ingest/processes/connectors/azure_ai_search.py,sha256=szhSRXzUHk0DE2hGFfjGc_jNFzlUwiRlCtIkuu7tmnk,11524
67
67
  unstructured_ingest/processes/connectors/chroma.py,sha256=q5_Fu4xb6_W_NyrPxVa3-jVwZLqVdlBNlR4dFvbd7l0,7235
68
- unstructured_ingest/processes/connectors/confluence.py,sha256=VAHGs_8HPYgBN8s8YwM7-LdzQ5MI_UEWXcMAMdpWLYk,20983
68
+ unstructured_ingest/processes/connectors/confluence.py,sha256=1oT4A83jSOWR8u8kldHImOBqSLxctdlsR-AZpzJfO9w,21098
69
69
  unstructured_ingest/processes/connectors/couchbase.py,sha256=KCHoYDNya9B05NIB5D78zXoizFyfpJRepcYBe1nLSOs,12298
70
70
  unstructured_ingest/processes/connectors/delta_table.py,sha256=2DFox_Vzoopt_D3Jy3rCjrrTGMutG2INIrwCeoIohRY,7340
71
71
  unstructured_ingest/processes/connectors/discord.py,sha256=6yEJ_agfKUqsV43wFsbMkcd8lcLJC0uqbo4izjdZ3rU,5294
72
72
  unstructured_ingest/processes/connectors/github.py,sha256=smHCz6jOH1p_hW2S25bYunBBj_pYjz8HTw6wkzaJz_A,7765
73
73
  unstructured_ingest/processes/connectors/gitlab.py,sha256=6h1CdqznJmzeWxGfXrFLdNdT23PExGnUMMX7usK_4Kk,10013
74
- unstructured_ingest/processes/connectors/google_drive.py,sha256=Nu6AA0yDCrtoSq5hqvpKJFNRFF0JcxHjZtDVbLay33Q,21817
74
+ unstructured_ingest/processes/connectors/google_drive.py,sha256=YMuobb9AZq-0pp6rbYyXNizs3jA6z3nQjO9cK_nncXQ,21936
75
75
  unstructured_ingest/processes/connectors/jira.py,sha256=eG8yTn8ZVEz7rBJ-ha8i_d9hEh6VALN6QJT_vbYvbL0,17142
76
76
  unstructured_ingest/processes/connectors/kdbai.py,sha256=XhxYpKSAoFPBsDQWwNuLX03DCxOVr7yquj9VYM55Rtc,5174
77
77
  unstructured_ingest/processes/connectors/local.py,sha256=LluTLKv4g7FbJb4A6vuSxI9VhzKZuuQUpDS-cVNAQ2g,7426
@@ -230,8 +230,8 @@ unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01q
230
230
  unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
231
231
  unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
232
232
  unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
233
- unstructured_ingest-1.0.5.dist-info/METADATA,sha256=D8wUuNkaBZMshLsm-S5kcLGgsJOv-xO6naAFJM2eVqI,8719
234
- unstructured_ingest-1.0.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
235
- unstructured_ingest-1.0.5.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
236
- unstructured_ingest-1.0.5.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
237
- unstructured_ingest-1.0.5.dist-info/RECORD,,
233
+ unstructured_ingest-1.0.7.dist-info/METADATA,sha256=mBois5a4uC3ZU_qyz0wHL906E45SwYPMcPTKg8joPWU,8719
234
+ unstructured_ingest-1.0.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
235
+ unstructured_ingest-1.0.7.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
236
+ unstructured_ingest-1.0.7.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
237
+ unstructured_ingest-1.0.7.dist-info/RECORD,,