unstructured-ingest 1.0.34__py3-none-any.whl → 1.0.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes.py +3 -2
- unstructured_ingest/processes/connectors/fsspec/fsspec.py +1 -1
- unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py +56 -3
- unstructured_ingest/processes/connectors/onedrive.py +5 -6
- unstructured_ingest/processes/connectors/sharepoint.py +32 -4
- {unstructured_ingest-1.0.34.dist-info → unstructured_ingest-1.0.37.dist-info}/METADATA +1 -1
- {unstructured_ingest-1.0.34.dist-info → unstructured_ingest-1.0.37.dist-info}/RECORD +11 -11
- {unstructured_ingest-1.0.34.dist-info → unstructured_ingest-1.0.37.dist-info}/WHEEL +0 -0
- {unstructured_ingest-1.0.34.dist-info → unstructured_ingest-1.0.37.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-1.0.34.dist-info → unstructured_ingest-1.0.37.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.0.34" # pragma: no cover
|
|
1
|
+
__version__ = "1.0.37" # pragma: no cover
|
|
@@ -196,9 +196,10 @@ class DatabricksVolumesUploader(Uploader, ABC):
|
|
|
196
196
|
connection_config: DatabricksVolumesConnectionConfig
|
|
197
197
|
|
|
198
198
|
def get_output_path(self, file_data: FileData) -> str:
|
|
199
|
-
if file_data.source_identifiers.
|
|
199
|
+
if file_data.source_identifiers.relative_path:
|
|
200
200
|
return os.path.join(
|
|
201
|
-
self.upload_config.path,
|
|
201
|
+
self.upload_config.path,
|
|
202
|
+
f"{file_data.source_identifiers.relative_path.lstrip('/')}.json",
|
|
202
203
|
)
|
|
203
204
|
else:
|
|
204
205
|
return os.path.join(
|
|
@@ -345,7 +345,7 @@ class FsspecUploader(Uploader):
|
|
|
345
345
|
def get_upload_path(self, file_data: FileData) -> Path:
|
|
346
346
|
upload_path = Path(
|
|
347
347
|
self.upload_config.path_without_protocol
|
|
348
|
-
) / file_data.source_identifiers.
|
|
348
|
+
) / file_data.source_identifiers.relative_path.lstrip("/")
|
|
349
349
|
updated_upload_path = upload_path.parent / f"{upload_path.name}.json"
|
|
350
350
|
return updated_upload_path
|
|
351
351
|
|
|
@@ -18,6 +18,12 @@ class OriginalSyncedBlock(BlockBase):
|
|
|
18
18
|
|
|
19
19
|
@classmethod
|
|
20
20
|
def from_dict(cls, data: dict):
|
|
21
|
+
"""Create OriginalSyncedBlock from dictionary data.
|
|
22
|
+
|
|
23
|
+
Original blocks contain children content.
|
|
24
|
+
"""
|
|
25
|
+
if "children" not in data:
|
|
26
|
+
raise ValueError(f"OriginalSyncedBlock data missing 'children': {data}")
|
|
21
27
|
return cls(children=data["children"])
|
|
22
28
|
|
|
23
29
|
def get_html(self) -> Optional[HtmlTag]:
|
|
@@ -31,27 +37,74 @@ class DuplicateSyncedBlock(BlockBase):
|
|
|
31
37
|
|
|
32
38
|
@staticmethod
|
|
33
39
|
def can_have_children() -> bool:
|
|
40
|
+
"""Check if duplicate synced blocks can have children.
|
|
41
|
+
|
|
42
|
+
Duplicate blocks themselves don't have children directly fetched here,
|
|
43
|
+
but they represent content that does, so Notion API might report has_children=True
|
|
44
|
+
on the parent block object. The actual children are fetched from the original block.
|
|
45
|
+
"""
|
|
34
46
|
return True
|
|
35
47
|
|
|
36
48
|
@classmethod
|
|
37
49
|
def from_dict(cls, data: dict):
|
|
38
|
-
|
|
50
|
+
"""Create DuplicateSyncedBlock from dictionary data.
|
|
51
|
+
|
|
52
|
+
Duplicate blocks contain a 'synced_from' reference.
|
|
53
|
+
"""
|
|
54
|
+
synced_from_data = data.get("synced_from")
|
|
55
|
+
if not synced_from_data or not isinstance(synced_from_data, dict):
|
|
56
|
+
raise ValueError(f"Invalid data structure for DuplicateSyncedBlock: {data}")
|
|
57
|
+
# Ensure required keys are present in the nested dictionary
|
|
58
|
+
if "type" not in synced_from_data or "block_id" not in synced_from_data:
|
|
59
|
+
raise ValueError(
|
|
60
|
+
f"Missing 'type' or 'block_id' in synced_from data: {synced_from_data}"
|
|
61
|
+
)
|
|
62
|
+
return cls(type=synced_from_data["type"], block_id=synced_from_data["block_id"])
|
|
39
63
|
|
|
40
64
|
def get_html(self) -> Optional[HtmlTag]:
|
|
65
|
+
"""Get HTML representation of the duplicate synced block.
|
|
66
|
+
|
|
67
|
+
HTML representation might need fetching the original block's content,
|
|
68
|
+
which is outside the scope of this simple data class.
|
|
69
|
+
"""
|
|
41
70
|
return None
|
|
42
71
|
|
|
43
72
|
|
|
44
73
|
class SyncBlock(BlockBase):
|
|
45
74
|
@staticmethod
|
|
46
75
|
def can_have_children() -> bool:
|
|
76
|
+
"""Check if synced blocks can have children.
|
|
77
|
+
|
|
78
|
+
Synced blocks (both original and duplicate) can conceptually have children.
|
|
79
|
+
"""
|
|
47
80
|
return True
|
|
48
81
|
|
|
49
82
|
@classmethod
|
|
50
83
|
def from_dict(cls, data: dict):
|
|
51
|
-
|
|
84
|
+
"""Create appropriate SyncedBlock subclass from dictionary data.
|
|
85
|
+
|
|
86
|
+
Determine if it's a duplicate (has 'synced_from') or original (has 'children').
|
|
87
|
+
"""
|
|
88
|
+
if data.get("synced_from") is not None:
|
|
89
|
+
# It's a duplicate block containing a reference
|
|
90
|
+
return DuplicateSyncedBlock.from_dict(data)
|
|
91
|
+
elif "children" in data:
|
|
92
|
+
# It's an original block containing children
|
|
52
93
|
return OriginalSyncedBlock.from_dict(data)
|
|
53
94
|
else:
|
|
54
|
-
|
|
95
|
+
# Handle cases where neither 'synced_from' nor 'children' are present.
|
|
96
|
+
# Notion API might return this for an empty original synced block.
|
|
97
|
+
# Let's treat it as an empty OriginalSyncedBlock.
|
|
98
|
+
# If this assumption is wrong, errors might occur later.
|
|
99
|
+
# Consider logging a warning here if strictness is needed.
|
|
100
|
+
return OriginalSyncedBlock(children=[])
|
|
101
|
+
|
|
55
102
|
|
|
56
103
|
def get_html(self) -> Optional[HtmlTag]:
|
|
104
|
+
"""Get HTML representation of the synced block.
|
|
105
|
+
|
|
106
|
+
The specific instance returned by from_dict (Original or Duplicate)
|
|
107
|
+
will handle its own get_html logic.
|
|
108
|
+
This method on the base SyncBlock might not be directly called.
|
|
109
|
+
"""
|
|
57
110
|
return None
|
|
@@ -369,15 +369,14 @@ class OnedriveUploader(Uploader):
|
|
|
369
369
|
|
|
370
370
|
# Use the remote_url from upload_config as the base destination folder
|
|
371
371
|
base_destination_folder = self.upload_config.url
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
# Combine the base destination folder with the file's full path
|
|
372
|
+
# Use the file's relative path to maintain directory structure, if needed
|
|
373
|
+
if file_data.source_identifiers and file_data.source_identifiers.relative_path:
|
|
374
|
+
# Combine the base destination folder with the file's relative path
|
|
376
375
|
destination_path = Path(base_destination_folder) / Path(
|
|
377
|
-
f"{file_data.source_identifiers.
|
|
376
|
+
f"{file_data.source_identifiers.relative_path}.json"
|
|
378
377
|
)
|
|
379
378
|
else:
|
|
380
|
-
# If no
|
|
379
|
+
# If no relative path is provided, upload directly to the base destination folder
|
|
381
380
|
destination_path = Path(base_destination_folder) / f"{path.name}.json"
|
|
382
381
|
|
|
383
382
|
destination_folder = destination_path.parent
|
|
@@ -29,6 +29,7 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
|
29
29
|
|
|
30
30
|
if TYPE_CHECKING:
|
|
31
31
|
from office365.onedrive.driveitems.driveItem import DriveItem
|
|
32
|
+
from office365.onedrive.sites.site import Site
|
|
32
33
|
|
|
33
34
|
CONNECTOR_TYPE = "sharepoint"
|
|
34
35
|
LEGACY_DEFAULT_PATH = "Shared Documents"
|
|
@@ -51,6 +52,33 @@ class SharepointConnectionConfig(OnedriveConnectionConfig):
|
|
|
51
52
|
https://[tenant]-admin.sharepoint.com.\
|
|
52
53
|
This requires the app to be registered at a tenant level"
|
|
53
54
|
)
|
|
55
|
+
library: Optional[str] = Field(
|
|
56
|
+
default=None,
|
|
57
|
+
description="Sharepoint library name. If not provided, the default \
|
|
58
|
+
drive will be used.",
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
def _get_drive_item(self, client_site: Site) -> DriveItem:
|
|
62
|
+
"""Helper method to get the drive item for the specified library or default drive."""
|
|
63
|
+
site_drive_item = None
|
|
64
|
+
if self.library:
|
|
65
|
+
for drive in client_site.drives.get().execute_query():
|
|
66
|
+
if drive.name == self.library:
|
|
67
|
+
logger.info(f"Found the requested library: {self.library}")
|
|
68
|
+
site_drive_item = drive.get().execute_query().root
|
|
69
|
+
break
|
|
70
|
+
|
|
71
|
+
# If no specific library was found or requested, use the default drive
|
|
72
|
+
if not site_drive_item:
|
|
73
|
+
if self.library:
|
|
74
|
+
logger.warning(
|
|
75
|
+
f"Library '{self.library}' not found in site '{self.site}'. "
|
|
76
|
+
"Using the default drive instead."
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
site_drive_item = client_site.drive.get().execute_query().root
|
|
80
|
+
|
|
81
|
+
return site_drive_item
|
|
54
82
|
|
|
55
83
|
|
|
56
84
|
class SharepointIndexerConfig(OnedriveIndexerConfig):
|
|
@@ -76,8 +104,8 @@ class SharepointIndexer(OnedriveIndexer):
|
|
|
76
104
|
|
|
77
105
|
client = await asyncio.to_thread(self.connection_config.get_client)
|
|
78
106
|
try:
|
|
79
|
-
|
|
80
|
-
site_drive_item =
|
|
107
|
+
client_site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
|
|
108
|
+
site_drive_item = self.connection_config._get_drive_item(client_site)
|
|
81
109
|
except ClientRequestException:
|
|
82
110
|
logger.info("Site not found")
|
|
83
111
|
|
|
@@ -118,8 +146,8 @@ class SharepointDownloader(OnedriveDownloader):
|
|
|
118
146
|
client = self.connection_config.get_client()
|
|
119
147
|
|
|
120
148
|
try:
|
|
121
|
-
|
|
122
|
-
site_drive_item =
|
|
149
|
+
client_site = client.sites.get_by_url(self.connection_config.site).get().execute_query()
|
|
150
|
+
site_drive_item = self.connection_config._get_drive_item(client_site)
|
|
123
151
|
except ClientRequestException:
|
|
124
152
|
logger.info("Site not found")
|
|
125
153
|
file = site_drive_item.get_by_path(server_relative_path).get().execute_query()
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
2
|
-
unstructured_ingest/__version__.py,sha256=
|
|
2
|
+
unstructured_ingest/__version__.py,sha256=De73lzt6X-hjX65lK6tF1Rs23QRJQqTCx5Zn-JyPtFI,43
|
|
3
3
|
unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
|
|
4
4
|
unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
|
|
5
5
|
unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
|
|
@@ -79,12 +79,12 @@ unstructured_ingest/processes/connectors/local.py,sha256=LluTLKv4g7FbJb4A6vuSxI9
|
|
|
79
79
|
unstructured_ingest/processes/connectors/milvus.py,sha256=Jr9cul7By03tGAPFnFBoqncnNWwbhKd-qbmkuqnin8U,8908
|
|
80
80
|
unstructured_ingest/processes/connectors/mongodb.py,sha256=1g_5bfbS6lah3nsOXqLAanR3zNYJ47_Njw_uV-uj3_U,14324
|
|
81
81
|
unstructured_ingest/processes/connectors/neo4j.py,sha256=ztxvI9KY8RF5kYUuMGSzzN5mz7Fu_4Ai9P7dqCpJLc0,20267
|
|
82
|
-
unstructured_ingest/processes/connectors/onedrive.py,sha256=
|
|
82
|
+
unstructured_ingest/processes/connectors/onedrive.py,sha256=JIADpc31PI9Yzr0raF6bSqzes2jhfcniUzew1aKVWeI,19305
|
|
83
83
|
unstructured_ingest/processes/connectors/outlook.py,sha256=zHM5frO7CqQG0-KcTyX49aZeSlsvVrl8kh_lR_ESgQw,9275
|
|
84
84
|
unstructured_ingest/processes/connectors/pinecone.py,sha256=pSREUNsQqel6q1EFZsFWelg-uZgGubQY5m_6nVnBFKs,15090
|
|
85
85
|
unstructured_ingest/processes/connectors/redisdb.py,sha256=rTihbfv0Mlk1eo5Izn-JXRu5Ad5C-KD58nSqeKsaZJ8,8024
|
|
86
86
|
unstructured_ingest/processes/connectors/salesforce.py,sha256=OaKEWCqZrirHqFJ650K5jSPwYlWefPOapas8Y-4D9oc,11661
|
|
87
|
-
unstructured_ingest/processes/connectors/sharepoint.py,sha256=
|
|
87
|
+
unstructured_ingest/processes/connectors/sharepoint.py,sha256=vIfLIactYXcdetccHvKlYOay6NOzGj2X0CkXbY0KuRo,6213
|
|
88
88
|
unstructured_ingest/processes/connectors/slack.py,sha256=EkFj9PcAu5_gF2xLogikKDADLbJYq-_jvchzYrTdLO4,9224
|
|
89
89
|
unstructured_ingest/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
|
|
90
90
|
unstructured_ingest/processes/connectors/vectara.py,sha256=xrC6jkgW8BII4UjdzUelDu122xT484cpfMTK2wl-sko,12292
|
|
@@ -92,7 +92,7 @@ unstructured_ingest/processes/connectors/assets/__init__.py,sha256=47DEQpj8HBSa-
|
|
|
92
92
|
unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql,sha256=8a9HTcRWA6IuswSD632b_uZSO6Dax_0rUYnflqktcek,226
|
|
93
93
|
unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json,sha256=SJlIO0kXxy866tWQ8bEzvwLwflsoUMIS-OKlxMvHIuE,504
|
|
94
94
|
unstructured_ingest/processes/connectors/databricks/__init__.py,sha256=RtKAPyNtXh6fzEsOQ08pA0-vC1uMr3KqYG6cqiBoo70,2133
|
|
95
|
-
unstructured_ingest/processes/connectors/databricks/volumes.py,sha256=
|
|
95
|
+
unstructured_ingest/processes/connectors/databricks/volumes.py,sha256=yT5JFbVzAEOJsKjfGH8KG3eQfKaTNFEsg_FVDPVK7Xs,8271
|
|
96
96
|
unstructured_ingest/processes/connectors/databricks/volumes_aws.py,sha256=WhGTp6aRTLSdc4GChCL4mz2b-IanderW8j1IqezX6YA,2958
|
|
97
97
|
unstructured_ingest/processes/connectors/databricks/volumes_azure.py,sha256=pF2d6uAIbwJJUeOIG5xknUMCGc5d9Aztmc2776wp-a0,3740
|
|
98
98
|
unstructured_ingest/processes/connectors/databricks/volumes_gcp.py,sha256=y9AvVl6PtnIxlTlrPj_wyHBDBRJNq3uoTOuZwTryNg8,2994
|
|
@@ -109,7 +109,7 @@ unstructured_ingest/processes/connectors/fsspec/__init__.py,sha256=3HTdw4L4mdN4W
|
|
|
109
109
|
unstructured_ingest/processes/connectors/fsspec/azure.py,sha256=31VNiG5YnXfhrFX7QJ2O1ubeWHxbe1sYVIztefbscAQ,7148
|
|
110
110
|
unstructured_ingest/processes/connectors/fsspec/box.py,sha256=1gLS7xR2vbjgKBrQ4ZpI1fKTsJuIDfXuAzx_a4FzxG4,5873
|
|
111
111
|
unstructured_ingest/processes/connectors/fsspec/dropbox.py,sha256=HwwKjQmjM7yFk9Esh_F20xDisRPXGUkFduzaasByRDE,8355
|
|
112
|
-
unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=
|
|
112
|
+
unstructured_ingest/processes/connectors/fsspec/fsspec.py,sha256=NbId5WMq6M5kF3fYAwSUuaL2e_gutgmTATrE_X8okGY,14467
|
|
113
113
|
unstructured_ingest/processes/connectors/fsspec/gcs.py,sha256=ouxISCKpZTAj3T6pWGYbASu93wytJjl5WSICvQcrgfE,7172
|
|
114
114
|
unstructured_ingest/processes/connectors/fsspec/s3.py,sha256=2ZV6b2E2pIsf_ab1Lty74FwpMnJZhpQUdamPgpwcKsQ,7141
|
|
115
115
|
unstructured_ingest/processes/connectors/fsspec/sftp.py,sha256=pR_a2SgLjt8ffNkariHrPB1E0HVSTj5h3pt7KxTU3TI,6371
|
|
@@ -166,7 +166,7 @@ unstructured_ingest/processes/connectors/notion/types/blocks/numbered_list.py,sh
|
|
|
166
166
|
unstructured_ingest/processes/connectors/notion/types/blocks/paragraph.py,sha256=qvc4orjP2XcbaeBWor-a3xAEglLkyb-epknm7SXgU1E,992
|
|
167
167
|
unstructured_ingest/processes/connectors/notion/types/blocks/pdf.py,sha256=St43RmpefAzDwJKTwz2CdGVm-xeUwHkYgtQtLYQbnw0,1661
|
|
168
168
|
unstructured_ingest/processes/connectors/notion/types/blocks/quote.py,sha256=yl7npmdcO6oFNgTNGVN_Ihvzexv12Xwg1r4NWAOjILQ,1176
|
|
169
|
-
unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py,sha256=
|
|
169
|
+
unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py,sha256=aHu4yg8N1EDqZmMEHK7dd7fiQ8Mc8otHQLJPRDbkaT8,4049
|
|
170
170
|
unstructured_ingest/processes/connectors/notion/types/blocks/table.py,sha256=eYUlRp4uCwjy_eB0mLh7MGMe1qrr_hnOxXS5RfUM2DQ,1724
|
|
171
171
|
unstructured_ingest/processes/connectors/notion/types/blocks/table_of_contents.py,sha256=bR5DdecXFz468okM5WOs10DK8_14Dj7OCLSRusMZzsk,534
|
|
172
172
|
unstructured_ingest/processes/connectors/notion/types/blocks/template.py,sha256=bq2Vh2X7ptpofs9OZnATHySZe2DzbOLsNNfpEI70NgM,968
|
|
@@ -231,8 +231,8 @@ unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01q
|
|
|
231
231
|
unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
|
|
232
232
|
unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
|
|
233
233
|
unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
|
|
234
|
-
unstructured_ingest-1.0.
|
|
235
|
-
unstructured_ingest-1.0.
|
|
236
|
-
unstructured_ingest-1.0.
|
|
237
|
-
unstructured_ingest-1.0.
|
|
238
|
-
unstructured_ingest-1.0.
|
|
234
|
+
unstructured_ingest-1.0.37.dist-info/METADATA,sha256=wct0um6qunVNGSNozJ0a3UatsfCHDyXG7p9XMNBCTcU,8747
|
|
235
|
+
unstructured_ingest-1.0.37.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
236
|
+
unstructured_ingest-1.0.37.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
237
|
+
unstructured_ingest-1.0.37.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
|
|
238
|
+
unstructured_ingest-1.0.37.dist-info/RECORD,,
|
|
{unstructured_ingest-1.0.34.dist-info → unstructured_ingest-1.0.37.dist-info}/WHEEL
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.34.dist-info → unstructured_ingest-1.0.37.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
{unstructured_ingest-1.0.34.dist-info → unstructured_ingest-1.0.37.dist-info}/licenses/LICENSE.md
RENAMED
|
File without changes
|