unstructured-ingest 0.5.17__py3-none-any.whl → 0.5.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1 +1 @@
1
- __version__ = "0.5.17" # pragma: no cover
1
+ __version__ = "0.5.18" # pragma: no cover
@@ -192,6 +192,16 @@ def get_data(path: Union[Path, str]) -> list[dict]:
192
192
  logger.warning(f"failed to read {path} as parquet: {e}")
193
193
 
194
194
 
195
+ def get_json_data(path: Path) -> list[dict]:
196
+ with path.open() as f:
197
+ if path.suffix == ".json":
198
+ return json.load(f)
199
+ elif path.suffix == ".ndjson":
200
+ return ndjson.load(f)
201
+ else:
202
+ raise ValueError(f"Unsupported file type: {path}")
203
+
204
+
195
205
  def get_data_df(path: Path) -> pd.DataFrame:
196
206
  with path.open() as f:
197
207
  if path.suffix == ".json":
@@ -1,4 +1,3 @@
1
- import json
2
1
  import os
3
2
  import traceback
4
3
  from dataclasses import dataclass, field
@@ -11,7 +10,7 @@ import pandas as pd
11
10
  from pydantic import Field, Secret
12
11
 
13
12
  from unstructured_ingest.error import DestinationConnectionError
14
- from unstructured_ingest.utils.data_prep import get_data_df
13
+ from unstructured_ingest.utils.data_prep import get_data_df, get_json_data
15
14
  from unstructured_ingest.utils.dep_check import requires_dependencies
16
15
  from unstructured_ingest.utils.table import convert_to_pandas_dataframe
17
16
  from unstructured_ingest.v2.interfaces import (
@@ -86,9 +85,7 @@ class DeltaTableUploadStager(UploadStager):
86
85
  output_filename: str,
87
86
  **kwargs: Any,
88
87
  ) -> Path:
89
- with open(elements_filepath) as elements_file:
90
- elements_contents = json.load(elements_file)
91
-
88
+ elements_contents = get_json_data(elements_filepath)
92
89
  output_path = Path(output_dir) / Path(f"{output_filename}.parquet")
93
90
 
94
91
  df = convert_to_pandas_dataframe(elements_dict=elements_contents)
@@ -1,7 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import asyncio
4
- import json
5
4
  import uuid
6
5
  from collections import defaultdict
7
6
  from contextlib import asynccontextmanager
@@ -14,7 +13,7 @@ from pydantic import BaseModel, ConfigDict, Field, Secret, field_validator
14
13
 
15
14
  from unstructured_ingest.error import DestinationConnectionError
16
15
  from unstructured_ingest.logger import logger
17
- from unstructured_ingest.utils.data_prep import batch_generator
16
+ from unstructured_ingest.utils.data_prep import batch_generator, get_json_data
18
17
  from unstructured_ingest.utils.dep_check import requires_dependencies
19
18
  from unstructured_ingest.v2.interfaces import (
20
19
  AccessConfig,
@@ -97,8 +96,7 @@ class Neo4jUploadStager(UploadStager):
97
96
  output_filename: str,
98
97
  **kwargs: Any,
99
98
  ) -> Path:
100
- with elements_filepath.open() as file:
101
- elements = json.load(file)
99
+ elements = get_json_data(elements_filepath)
102
100
 
103
101
  nx_graph = self._create_lexical_graph(
104
102
  elements, self._create_document_node(file_data=file_data)
@@ -294,8 +292,7 @@ class Neo4jUploader(Uploader):
294
292
  return True
295
293
 
296
294
  async def run_async(self, path: Path, file_data: FileData, **kwargs) -> None: # type: ignore
297
- with path.open() as file:
298
- staged_data = json.load(file)
295
+ staged_data = get_json_data(path)
299
296
 
300
297
  graph_data = _GraphData.model_validate(staged_data)
301
298
  async with self.connection_config.get_client() as client:
@@ -35,6 +35,10 @@ from unstructured_ingest.v2.processes.connector_registry import (
35
35
  DestinationRegistryEntry,
36
36
  SourceRegistryEntry,
37
37
  )
38
+ from unstructured_ingest.v2.processes.utils.blob_storage import (
39
+ BlobStoreUploadStager,
40
+ BlobStoreUploadStagerConfig,
41
+ )
38
42
 
39
43
  if TYPE_CHECKING:
40
44
  from office365.graph_client import GraphClient
@@ -428,4 +432,6 @@ onedrive_destination_entry = DestinationRegistryEntry(
428
432
  connection_config=OnedriveConnectionConfig,
429
433
  uploader=OnedriveUploader,
430
434
  uploader_config=OnedriveUploaderConfig,
435
+ upload_stager_config=BlobStoreUploadStagerConfig,
436
+ upload_stager=BlobStoreUploadStager,
431
437
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: unstructured-ingest
3
- Version: 0.5.17
3
+ Version: 0.5.18
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -25,10 +25,10 @@ License-File: LICENSE.md
25
25
  Requires-Dist: python-dateutil
26
26
  Requires-Dist: opentelemetry-sdk
27
27
  Requires-Dist: pandas
28
- Requires-Dist: click
29
28
  Requires-Dist: dataclasses_json
30
- Requires-Dist: pydantic>=2.7
31
29
  Requires-Dist: tqdm
30
+ Requires-Dist: click
31
+ Requires-Dist: pydantic>=2.7
32
32
  Provides-Extra: remote
33
33
  Requires-Dist: unstructured-client>=0.30.0; extra == "remote"
34
34
  Provides-Extra: csv
@@ -66,8 +66,8 @@ Requires-Dist: pyairtable; extra == "airtable"
66
66
  Provides-Extra: astradb
67
67
  Requires-Dist: astrapy; extra == "astradb"
68
68
  Provides-Extra: azure
69
- Requires-Dist: fsspec; extra == "azure"
70
69
  Requires-Dist: adlfs; extra == "azure"
70
+ Requires-Dist: fsspec; extra == "azure"
71
71
  Provides-Extra: azure-ai-search
72
72
  Requires-Dist: azure-search-documents; extra == "azure-ai-search"
73
73
  Provides-Extra: biomed
@@ -81,29 +81,29 @@ Requires-Dist: chromadb; extra == "chroma"
81
81
  Provides-Extra: clarifai
82
82
  Requires-Dist: clarifai; extra == "clarifai"
83
83
  Provides-Extra: confluence
84
- Requires-Dist: atlassian-python-api; extra == "confluence"
85
84
  Requires-Dist: requests; extra == "confluence"
85
+ Requires-Dist: atlassian-python-api; extra == "confluence"
86
86
  Provides-Extra: couchbase
87
87
  Requires-Dist: couchbase; extra == "couchbase"
88
88
  Provides-Extra: delta-table
89
- Requires-Dist: boto3; extra == "delta-table"
90
89
  Requires-Dist: deltalake; extra == "delta-table"
90
+ Requires-Dist: boto3; extra == "delta-table"
91
91
  Provides-Extra: discord
92
92
  Requires-Dist: discord.py; extra == "discord"
93
93
  Provides-Extra: dropbox
94
- Requires-Dist: dropboxdrivefs; extra == "dropbox"
95
94
  Requires-Dist: fsspec; extra == "dropbox"
95
+ Requires-Dist: dropboxdrivefs; extra == "dropbox"
96
96
  Provides-Extra: duckdb
97
97
  Requires-Dist: duckdb; extra == "duckdb"
98
98
  Provides-Extra: elasticsearch
99
99
  Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
100
100
  Provides-Extra: gcs
101
101
  Requires-Dist: bs4; extra == "gcs"
102
- Requires-Dist: gcsfs; extra == "gcs"
103
102
  Requires-Dist: fsspec; extra == "gcs"
103
+ Requires-Dist: gcsfs; extra == "gcs"
104
104
  Provides-Extra: github
105
- Requires-Dist: requests; extra == "github"
106
105
  Requires-Dist: pygithub>1.58.0; extra == "github"
106
+ Requires-Dist: requests; extra == "github"
107
107
  Provides-Extra: gitlab
108
108
  Requires-Dist: python-gitlab; extra == "gitlab"
109
109
  Provides-Extra: google-drive
@@ -128,19 +128,19 @@ Requires-Dist: networkx; extra == "neo4j"
128
128
  Requires-Dist: neo4j-rust-ext; extra == "neo4j"
129
129
  Requires-Dist: cymple; extra == "neo4j"
130
130
  Provides-Extra: notion
131
- Requires-Dist: htmlBuilder; extra == "notion"
132
- Requires-Dist: notion-client; extra == "notion"
133
131
  Requires-Dist: httpx; extra == "notion"
134
132
  Requires-Dist: backoff; extra == "notion"
133
+ Requires-Dist: notion-client; extra == "notion"
134
+ Requires-Dist: htmlBuilder; extra == "notion"
135
135
  Provides-Extra: onedrive
136
136
  Requires-Dist: bs4; extra == "onedrive"
137
- Requires-Dist: msal; extra == "onedrive"
138
137
  Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
138
+ Requires-Dist: msal; extra == "onedrive"
139
139
  Provides-Extra: opensearch
140
140
  Requires-Dist: opensearch-py; extra == "opensearch"
141
141
  Provides-Extra: outlook
142
- Requires-Dist: msal; extra == "outlook"
143
142
  Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
143
+ Requires-Dist: msal; extra == "outlook"
144
144
  Provides-Extra: pinecone
145
145
  Requires-Dist: pinecone-client>=3.7.1; extra == "pinecone"
146
146
  Provides-Extra: postgres
@@ -155,8 +155,8 @@ Provides-Extra: s3
155
155
  Requires-Dist: fsspec; extra == "s3"
156
156
  Requires-Dist: s3fs; extra == "s3"
157
157
  Provides-Extra: sharepoint
158
- Requires-Dist: msal; extra == "sharepoint"
159
158
  Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
159
+ Requires-Dist: msal; extra == "sharepoint"
160
160
  Provides-Extra: salesforce
161
161
  Requires-Dist: simple-salesforce; extra == "salesforce"
162
162
  Provides-Extra: sftp
@@ -165,8 +165,8 @@ Requires-Dist: fsspec; extra == "sftp"
165
165
  Provides-Extra: slack
166
166
  Requires-Dist: slack_sdk[optional]; extra == "slack"
167
167
  Provides-Extra: snowflake
168
- Requires-Dist: snowflake-connector-python; extra == "snowflake"
169
168
  Requires-Dist: psycopg2-binary; extra == "snowflake"
169
+ Requires-Dist: snowflake-connector-python; extra == "snowflake"
170
170
  Provides-Extra: wikipedia
171
171
  Requires-Dist: wikipedia; extra == "wikipedia"
172
172
  Provides-Extra: weaviate
@@ -178,17 +178,17 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
178
178
  Provides-Extra: singlestore
179
179
  Requires-Dist: singlestoredb; extra == "singlestore"
180
180
  Provides-Extra: vectara
181
+ Requires-Dist: aiofiles; extra == "vectara"
181
182
  Requires-Dist: httpx; extra == "vectara"
182
183
  Requires-Dist: requests; extra == "vectara"
183
- Requires-Dist: aiofiles; extra == "vectara"
184
184
  Provides-Extra: vastdb
185
- Requires-Dist: pyarrow; extra == "vastdb"
186
185
  Requires-Dist: vastdb; extra == "vastdb"
186
+ Requires-Dist: pyarrow; extra == "vastdb"
187
187
  Requires-Dist: ibis; extra == "vastdb"
188
188
  Provides-Extra: zendesk
189
+ Requires-Dist: aiofiles; extra == "zendesk"
189
190
  Requires-Dist: httpx; extra == "zendesk"
190
191
  Requires-Dist: bs4; extra == "zendesk"
191
- Requires-Dist: aiofiles; extra == "zendesk"
192
192
  Provides-Extra: embed-huggingface
193
193
  Requires-Dist: sentence-transformers; extra == "embed-huggingface"
194
194
  Provides-Extra: embed-octoai
@@ -111,7 +111,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
111
111
  test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
112
112
  test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
113
113
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
114
- unstructured_ingest/__version__.py,sha256=KerQQWRY3fKuaApvHFeNECgY6_9Sn7bl2FfaUuLCr4c,43
114
+ unstructured_ingest/__version__.py,sha256=QYn6GUOSyCz_KH2wi4yg_FlUU4SE844Xhf0hR6-jv8s,43
115
115
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
116
116
  unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
117
117
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -370,7 +370,7 @@ unstructured_ingest/runner/writers/fsspec/s3.py,sha256=kHJq2O3864QBd_tL2SKb0mdyw
370
370
  unstructured_ingest/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
371
371
  unstructured_ingest/utils/chunking.py,sha256=9b3sXMA6L8RW5xAkKQbwdtVudGLAcj_sgT6Grh5tyYM,1870
372
372
  unstructured_ingest/utils/compression.py,sha256=NNiY-2S2Gf3at7zC1PYxMijaEza9vVSzRn5mdFf6mHo,4434
373
- unstructured_ingest/utils/data_prep.py,sha256=AKtsdu9stYA63CV1C5B_fFWigqy-giVv-euumitos-A,7266
373
+ unstructured_ingest/utils/data_prep.py,sha256=MfID_7SPZHeZztlNTSXIzilaWvv1mdfCcLlhqpGLYNg,7557
374
374
  unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWhmyBa7776Y,2400
375
375
  unstructured_ingest/utils/google_filetype.py,sha256=YVspEkiiBrRUSGVeVbsavvLvTmizdy2e6TsjigXTSRU,468
376
376
  unstructured_ingest/utils/html.py,sha256=DGRDMqGbwH8RiF94Qh6NiqVkbbjZfe1h26dIehC-X7M,6340
@@ -433,7 +433,7 @@ unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=ngPDpU0oZ6
433
433
  unstructured_ingest/v2/processes/connectors/chroma.py,sha256=VHCnM56qNXuHzovJihrNfJnZbWLJShOe8j12PJFrbL0,7219
434
434
  unstructured_ingest/v2/processes/connectors/confluence.py,sha256=gSs4-AxL0gfeWdJfP7JfCrQSQNLoJRkvHquKK9RJvpQ,12043
435
435
  unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=i7vuNKsUkN93JRVmg4--MO0ZgbjvhIqt46oYqk9zFSQ,12250
436
- unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=xvLWTSFEC3gyGTwEISXxWmUoAfCgzdgZkETMMBOPHuI,7153
436
+ unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=bfEGiepJLOS9TxK-bMkjTTjHLHUc0q7qUzIYdwkLDMs,7104
437
437
  unstructured_ingest/v2/processes/connectors/discord.py,sha256=-e4-cBK4TnHkknK1qIb86AIVMy81lBgC288_iLpTzM8,5246
438
438
  unstructured_ingest/v2/processes/connectors/gitlab.py,sha256=ufE65Z8q_tC4oppGg5BsGXwSaL7RbEXcaagJQYsylNo,9984
439
439
  unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=QzcHNelUbnubsDtanFIgDCRzmYTuP-GjJ_g9y8fButE,19623
@@ -442,8 +442,8 @@ unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=VRDAiou_7oWOIAgQTdOG
442
442
  unstructured_ingest/v2/processes/connectors/local.py,sha256=FWPRjjUsnQjyZMChuZGuMU04AB5X0sFEOcAXhx1r9sk,7381
443
443
  unstructured_ingest/v2/processes/connectors/milvus.py,sha256=wmcu9NVy3gYlQGT25inN5w_QrhFoL8-hRq0pJFSNw8g,8866
444
444
  unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=cL0QUQZF_s2brh3nNNeAywXVpaIiND4b5JTAFlYjLjw,14273
445
- unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=vxf6Xuh-OMS09Y-mIF0PIwrFauqRtoI7vjeLBXsFwTk,18744
446
- unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=EM9fq67RsiudZvZbi6nDXkS-i6W0xLvbkNvD0G-Ni5E,17779
445
+ unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=I-eDLAlThHKKFQfkZpQL8CLFBDy5krWgTQANLgMTwTk,18679
446
+ unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=5rg7t40gKxDHNcuJrJHmVzJ9uM7Ct4RBOvFsfwdGc5c,18002
447
447
  unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
448
448
  unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=O9lC4mZ9V_exg9apiCJSWHsgkuYDSEOlI6CaUS5ZB7c,13961
449
449
  unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=p0AY4ukBNpwAemV4bWzpScvVbLTVlI3DzsCNUKiBI5M,6757
@@ -577,9 +577,9 @@ unstructured_ingest/v2/processes/connectors/zendesk/client.py,sha256=DDAYQB7catK
577
577
  unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py,sha256=R8SXYkRhVUoWEHdGCt2CzcTxxuFundw_0GlGZ34YmbM,8987
578
578
  unstructured_ingest/v2/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
579
579
  unstructured_ingest/v2/processes/utils/blob_storage.py,sha256=EWvK4HRYubr9i1UyMhv5cU9u0UzVkCDC_BIm4Uxab7Y,964
580
- unstructured_ingest-0.5.17.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
581
- unstructured_ingest-0.5.17.dist-info/METADATA,sha256=cfGRkKOAflmCnSh-KoaCVyBcWD2SN_onWWc0tVaZ8fc,8465
582
- unstructured_ingest-0.5.17.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
583
- unstructured_ingest-0.5.17.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
584
- unstructured_ingest-0.5.17.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
585
- unstructured_ingest-0.5.17.dist-info/RECORD,,
580
+ unstructured_ingest-0.5.18.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
581
+ unstructured_ingest-0.5.18.dist-info/METADATA,sha256=K47-NP1RfNwqRnvbZ8vO75ab5J5RSmb5nocwSXNwqko,8465
582
+ unstructured_ingest-0.5.18.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
583
+ unstructured_ingest-0.5.18.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
584
+ unstructured_ingest-0.5.18.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
585
+ unstructured_ingest-0.5.18.dist-info/RECORD,,