unstructured-ingest 1.0.34__py3-none-any.whl → 1.0.37__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1 +1 @@
1
- __version__ = "1.0.34" # pragma: no cover
1
+ __version__ = "1.0.37" # pragma: no cover
@@ -196,9 +196,10 @@ class DatabricksVolumesUploader(Uploader, ABC):
196
196
  connection_config: DatabricksVolumesConnectionConfig
197
197
 
198
198
  def get_output_path(self, file_data: FileData) -> str:
199
- if file_data.source_identifiers.fullpath:
199
+ if file_data.source_identifiers.relative_path:
200
200
  return os.path.join(
201
- self.upload_config.path, f"{file_data.source_identifiers.fullpath}.json"
201
+ self.upload_config.path,
202
+ f"{file_data.source_identifiers.relative_path.lstrip('/')}.json",
202
203
  )
203
204
  else:
204
205
  return os.path.join(
@@ -345,7 +345,7 @@ class FsspecUploader(Uploader):
345
345
  def get_upload_path(self, file_data: FileData) -> Path:
346
346
  upload_path = Path(
347
347
  self.upload_config.path_without_protocol
348
- ) / file_data.source_identifiers.fullpath.lstrip("/")
348
+ ) / file_data.source_identifiers.relative_path.lstrip("/")
349
349
  updated_upload_path = upload_path.parent / f"{upload_path.name}.json"
350
350
  return updated_upload_path
351
351
 
@@ -18,6 +18,12 @@ class OriginalSyncedBlock(BlockBase):
18
18
 
19
19
  @classmethod
20
20
  def from_dict(cls, data: dict):
21
+ """Create OriginalSyncedBlock from dictionary data.
22
+
23
+ Original blocks contain children content.
24
+ """
25
+ if "children" not in data:
26
+ raise ValueError(f"OriginalSyncedBlock data missing 'children': {data}")
21
27
  return cls(children=data["children"])
22
28
 
23
29
  def get_html(self) -> Optional[HtmlTag]:
@@ -31,27 +37,74 @@ class DuplicateSyncedBlock(BlockBase):
31
37
 
32
38
  @staticmethod
33
39
  def can_have_children() -> bool:
40
+ """Check if duplicate synced blocks can have children.
41
+
42
+ Duplicate blocks themselves don't have children directly fetched here,
43
+ but they represent content that does, so Notion API might report has_children=True
44
+ on the parent block object. The actual children are fetched from the original block.
45
+ """
34
46
  return True
35
47
 
36
48
  @classmethod
37
49
  def from_dict(cls, data: dict):
38
- return cls(**data)
50
+ """Create DuplicateSyncedBlock from dictionary data.
51
+
52
+ Duplicate blocks contain a 'synced_from' reference.
53
+ """
54
+ synced_from_data = data.get("synced_from")
55
+ if not synced_from_data or not isinstance(synced_from_data, dict):
56
+ raise ValueError(f"Invalid data structure for DuplicateSyncedBlock: {data}")
57
+ # Ensure required keys are present in the nested dictionary
58
+ if "type" not in synced_from_data or "block_id" not in synced_from_data:
59
+ raise ValueError(
60
+ f"Missing 'type' or 'block_id' in synced_from data: {synced_from_data}"
61
+ )
62
+ return cls(type=synced_from_data["type"], block_id=synced_from_data["block_id"])
39
63
 
40
64
  def get_html(self) -> Optional[HtmlTag]:
65
+ """Get HTML representation of the duplicate synced block.
66
+
67
+ HTML representation might need fetching the original block's content,
68
+ which is outside the scope of this simple data class.
69
+ """
41
70
  return None
42
71
 
43
72
 
44
73
  class SyncBlock(BlockBase):
45
74
  @staticmethod
46
75
  def can_have_children() -> bool:
76
+ """Check if synced blocks can have children.
77
+
78
+ Synced blocks (both original and duplicate) can conceptually have children.
79
+ """
47
80
  return True
48
81
 
49
82
  @classmethod
50
83
  def from_dict(cls, data: dict):
51
- if "synced_from" in data:
84
+ """Create appropriate SyncedBlock subclass from dictionary data.
85
+
86
+ Determine if it's a duplicate (has 'synced_from') or original (has 'children').
87
+ """
88
+ if data.get("synced_from") is not None:
89
+ # It's a duplicate block containing a reference
90
+ return DuplicateSyncedBlock.from_dict(data)
91
+ elif "children" in data:
92
+ # It's an original block containing children
52
93
  return OriginalSyncedBlock.from_dict(data)
53
94
  else:
54
- return DuplicateSyncedBlock.from_dict(data)
95
+ # Handle cases where neither 'synced_from' nor 'children' are present.
96
+ # Notion API might return this for an empty original synced block.
97
+ # Let's treat it as an empty OriginalSyncedBlock.
98
+ # If this assumption is wrong, errors might occur later.
99
+ # Consider logging a warning here if strictness is needed.
100
+ return OriginalSyncedBlock(children=[])
101
+
55
102
 
56
103
  def get_html(self) -> Optional[HtmlTag]:
104
+ """Get HTML representation of the synced block.
105
+
106
+ The specific instance returned by from_dict (Original or Duplicate)
107
+ will handle its own get_html logic.
108
+ This method on the base SyncBlock might not be directly called.
109
+ """
57
110
  return None
@@ -369,15 +369,14 @@ class OnedriveUploader(Uploader):
369
369
 
370
370
  # Use the remote_url from upload_config as the base destination folder
371
371
  base_destination_folder = self.upload_config.url
372
-
373
- # Use the file's full path to maintain directory structure, if needed
374
- if file_data.source_identifiers and file_data.source_identifiers.fullpath:
375
- # Combine the base destination folder with the file's full path
372
+ # Use the file's relative path to maintain directory structure, if needed
373
+ if file_data.source_identifiers and file_data.source_identifiers.relative_path:
374
+ # Combine the base destination folder with the file's relative path
376
375
  destination_path = Path(base_destination_folder) / Path(
377
- f"{file_data.source_identifiers.fullpath}.json"
376
+ f"{file_data.source_identifiers.relative_path}.json"
378
377
  )
379
378
  else:
380
- # If no full path is provided, upload directly to the base destination folder
379
+ # If no relative path is provided, upload directly to the base destination folder
381
380
  destination_path = Path(base_destination_folder) / f"{path.name}.json"
382
381
 
383
382
  destination_folder = destination_path.parent
@@ -29,6 +29,7 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
29
29
 
30
30
  if TYPE_CHECKING:
31
31
  from office365.onedrive.driveitems.driveItem import DriveItem
32
+ from office365.onedrive.sites.site import Site
32
33
 
33
34
  CONNECTOR_TYPE = "sharepoint"
34
35
  LEGACY_DEFAULT_PATH = "Shared Documents"
@@ -51,6 +52,33 @@ class SharepointConnectionConfig(OnedriveConnectionConfig):
51
52
  https://[tenant]-admin.sharepoint.com.\
52
53
  This requires the app to be registered at a tenant level"
53
54
  )
55
+ library: Optional[str] = Field(
56
+ default=None,
57
+ description="Sharepoint library name. If not provided, the default \
58
+ drive will be used.",
59
+ )
60
+
61
+ def _get_drive_item(self, client_site: Site) -> DriveItem:
62
+ """Helper method to get the drive item for the specified library or default drive."""
63
+ site_drive_item = None
64
+ if self.library:
65
+ for drive in client_site.drives.get().execute_query():
66
+ if drive.name == self.library:
67
+ logger.info(f"Found the requested library: {self.library}")
68
+ site_drive_item = drive.get().execute_query().root
69
+ break
70
+
71
+ # If no specific library was found or requested, use the default drive
72
+ if not site_drive_item:
73
+ if self.library:
74
+ logger.warning(
75
+ f"Library '{self.library}' not found in site '{self.site}'. "
76
+ "Using the default drive instead."
77
+ )
78
+
79
+ site_drive_item = client_site.drive.get().execute_query().root
80
+
81
+ return site_drive_item
54
82
 
55
83
 
56
84
  class SharepointIndexerConfig(OnedriveIndexerConfig):
@@ -76,8 +104,8 @@ class SharepointIndexer(OnedriveIndexer):
76
104
 
77
105
  client = await asyncio.to_thread(self.connection_config.get_client)
78
106
  try:
79
- site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
80
- site_drive_item = site.drive.get().execute_query().root
107
+ client_site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
108
+ site_drive_item = self.connection_config._get_drive_item(client_site)
81
109
  except ClientRequestException:
82
110
  logger.info("Site not found")
83
111
 
@@ -118,8 +146,8 @@ class SharepointDownloader(OnedriveDownloader):
118
146
  client = self.connection_config.get_client()
119
147
 
120
148
  try:
121
- site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
122
- site_drive_item = site.drive.get().execute_query().root
149
+ client_site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
150
+ site_drive_item = self.connection_config._get_drive_item(client_site)
123
151
  except ClientRequestException:
124
152
  logger.info("Site not found")
125
153
  file = site_drive_item.get_by_path(server_relative_path).get().execute_query()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: unstructured_ingest
3
- Version: 1.0.34
3
+ Version: 1.0.37
4
4
  Summary: Local ETL data pipeline to get data RAG ready
5
5
  Author-email: Unstructured Technologies <devops@unstructuredai.io>
6
6
  License-Expression: Apache-2.0
@@ -1,5 +1,5 @@
1
1
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
2
- unstructured_ingest/__version__.py,sha256=S3Vgmk2V2EWfbef_sUbnJb_d5x0m64Z8D_xx-_9kXOM,43
2
+ unstructured_ingest/__version__.py,sha256=De73lzt6X-hjX65lK6tF1Rs23QRJQqTCx5Zn-JyPtFI,43
3
3
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
4
4
  unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
5
5
  unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
@@ -79,12 +79,12 @@ unstructured_ingest/processes/connectors/local.py,sha256=LluTLKv4g7FbJb4A6vuSxI9
79
79
  unstructured_ingest/processes/connectors/milvus.py,sha256=Jr9cul7By03tGAPFnFBoqncnNWwbhKd-qbmkuqnin8U,8908
80
80
  unstructured_ingest/processes/connectors/mongodb.py,sha256=1g_5bfbS6lah3nsOXqLAanR3zNYJ47_Njw_uV-uj3_U,14324
81
81
  unstructured_ingest/processes/connectors/neo4j.py,sha256=ztxvI9KY8RF5kYUuMGSzzN5mz7Fu_4Ai9P7dqCpJLc0,20267
82
- unstructured_ingest/processes/connectors/onedrive.py,sha256=k0bhQCCSIgmHAk3lQd4CMA3dc4fPAjegNlLxlDWGowc,19284
82
+ unstructured_ingest/processes/connectors/onedrive.py,sha256=JIADpc31PI9Yzr0raF6bSqzes2jhfcniUzew1aKVWeI,19305
83
83
  unstructured_ingest/processes/connectors/outlook.py,sha256=zHM5frO7CqQG0-KcTyX49aZeSlsvVrl8kh_lR_ESgQw,9275
84
84
  unstructured_ingest/processes/connectors/pinecone.py,sha256=pSREUNsQqel6q1EFZsFWelg-uZgGubQY5m_6nVnBFKs,15090
85
85
  unstructured_ingest/processes/connectors/redisdb.py,sha256=rTihbfv0Mlk1eo5Izn-JXRu5Ad5C-KD58nSqeKsaZJ8,8024
86
86
  unstructured_ingest/processes/connectors/salesforce.py,sha256=OaKEWCqZrirHqFJ650K5jSPwYlWefPOapas8Y-4D9oc,11661
87
- unstructured_ingest/processes/connectors/sharepoint.py,sha256=jI-erp4YUfHxPeUTcfHSPEG3w0wjSBYfAnMg1WT6lfw,4996
87
+ unstructured_ingest/processes/connectors/sharepoint.py,sha256=vIfLIactYXcdetccHvKlYOay6NOzGj2X0CkXbY0KuRo,6213
88
88
  unstructured_ingest/processes/connectors/slack.py,sha256=EkFj9PcAu5_gF2xLogikKDADLbJYq-_jvchzYrTdLO4,9224
89
89
  unstructured_ingest/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
90
90
  unstructured_ingest/processes/connectors/vectara.py,sha256=xrC6jkgW8BII4UjdzUelDu122xT484cpfMTK2wl-sko,12292
@@ -92,7 +92,7 @@ unstructured_ingest/processes/connectors/assets/__init__.py,sha256=47DEQpj8HBSa-
92
92
  unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql,sha256=8a9HTcRWA6IuswSD632b_uZSO6Dax_0rUYnflqktcek,226
93
93
  unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json,sha256=SJlIO0kXxy866tWQ8bEzvwLwflsoUMIS-OKlxMvHIuE,504
94
94
  unstructured_ingest/processes/connectors/databricks/__init__.py,sha256=RtKAPyNtXh6fzEsOQ08pA0-vC1uMr3KqYG6cqiBoo70,2133
95
- unstructured_ingest/processes/connectors/databricks/volumes.py,sha256=fZeXRozTUM3JeZlmsxhn_glqRhxr8CGG-8I8QRhRcP8,8232
95
+ unstructured_ingest/processes/connectors/databricks/volumes.py,sha256=yT5JFbVzAEOJsKjfGH8KG3eQfKaTNFEsg_FVDPVK7Xs,8271
96
96
  unstructured_ingest/processes/connectors/databricks/volumes_aws.py,sha256=WhGTp6aRTLSdc4GChCL4mz2b-IanderW8j1IqezX6YA,2958
97
97
  unstructured_ingest/processes/connectors/databricks/volumes_azure.py,sha256=pF2d6uAIbwJJUeOIG5xknUMCGc5d9Aztmc2776wp-a0,3740
98
98
  unstructured_ingest/processes/connectors/databricks/volumes_gcp.py,sha256=y9AvVl6PtnIxlTlrPj_wyHBDBRJNq3uoTOuZwTryNg8,2994
@@ -109,7 +109,7 @@ unstructured_ingest/processes/connectors/fsspec/__init__.py,sha256=3HTdw4L4mdN4W
109
109
  unstructured_ingest/processes/connectors/fsspec/azure.py,sha256=31VNiG5YnXfhrFX7QJ2O1ubeWHxbe1sYVIztefbscAQ,7148
110
110
  unstructured_ingest/processes/connectors/fsspec/box.py,sha256=1gLS7xR2vbjgKBrQ4ZpI1fKTsJuIDfXuAzx_a4FzxG4,5873
111
111
  unstructured_ingest/processes/connectors/fsspec/dropbox.py,sha256=HwwKjQmjM7yFk9Esh_F20xDisRPXGUkFduzaasByRDE,8355
112
- unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=4K8Q2D_6_HCqTVM3HBJv3SNz9gjbQhk44nzeSheDpzA,14462
112
+ unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=NbId5WMq6M5kF3fYAwSUuaL2e_gutgmTATrE_X8okGY,14467
113
113
  unstructured_ingest/processes/connectors/fsspec/gcs.py,sha256=ouxISCKpZTAj3T6pWGYbASu93wytJjl5WSICvQcrgfE,7172
114
114
  unstructured_ingest/processes/connectors/fsspec/s3.py,sha256=2ZV6b2E2pIsf_ab1Lty74FwpMnJZhpQUdamPgpwcKsQ,7141
115
115
  unstructured_ingest/processes/connectors/fsspec/sftp.py,sha256=pR_a2SgLjt8ffNkariHrPB1E0HVSTj5h3pt7KxTU3TI,6371
@@ -166,7 +166,7 @@ unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py,sh
166
166
  unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py,sha256=qvc4orjP2XcbaeBWor-a3xAEglLkyb-epknm7SXgU1E,992
167
167
  unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py,sha256=St43RmpefAzDwJKTwz2CdGVm-xeUwHkYgtQtLYQbnw0,1661
168
168
  unstructured_ingest/processes/connectors/notion/types/blocks/quote.py,sha256=yl7npmdcO6oFNgTNGVN_Ihvzexv12Xwg1r4NWAOjILQ,1176
169
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py,sha256=Rc3xyKtnOwovx-O-dzmS9pX0h4-s41YnWmmEz5TYxdU,1333
169
+ unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py,sha256=aHu4yg8N1EDqZmMEHK7dd7fiQ8Mc8otHQLJPRDbkaT8,4049
170
170
  unstructured_ingest/processes/connectors/notion/types/blocks/table.py,sha256=eYUlRp4uCwjy_eB0mLh7MGMe1qrr_hnOxXS5RfUM2DQ,1724
171
171
  unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py,sha256=bR5DdecXFz468okM5WOs10DK8_14Dj7OCLSRusMZzsk,534
172
172
  unstructured_ingest/processes/connectors/notion/types/blocks/template.py,sha256=bq2Vh2X7ptpofs9OZnATHySZe2DzbOLsNNfpEI70NgM,968
@@ -231,8 +231,8 @@ unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01q
231
231
  unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
232
232
  unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
233
233
  unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
234
- unstructured_ingest-1.0.34.dist-info/METADATA,sha256=Pw-KP4al9gteAFj6lqY7xkFRjWj1rTAgN960UsAUZAM,8747
235
- unstructured_ingest-1.0.34.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
236
- unstructured_ingest-1.0.34.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
237
- unstructured_ingest-1.0.34.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
238
- unstructured_ingest-1.0.34.dist-info/RECORD,,
234
+ unstructured_ingest-1.0.37.dist-info/METADATA,sha256=wct0um6qunVNGSNozJ0a3UatsfCHDyXG7p9XMNBCTcU,8747
235
+ unstructured_ingest-1.0.37.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
236
+ unstructured_ingest-1.0.37.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
237
+ unstructured_ingest-1.0.37.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
238
+ unstructured_ingest-1.0.37.dist-info/RECORD,,