unstructured-ingest 0.5.12__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1 +1 @@
1
- __version__ = "0.5.12" # pragma: no cover
1
+ __version__ = "0.5.13" # pragma: no cover
@@ -154,7 +154,7 @@ class Neo4jUploadStager(UploadStager):
154
154
  self._add_entities(element, graph, element_node)
155
155
 
156
156
  if self._is_chunk(element):
157
- for origin_element in format_and_truncate_orig_elements(element):
157
+ for origin_element in format_and_truncate_orig_elements(element, include_text=True):
158
158
  origin_element_node = self._create_element_node(origin_element)
159
159
 
160
160
  graph.add_edge(
@@ -327,13 +327,16 @@ class Neo4jUploader(Uploader):
327
327
  async def _create_vector_index(
328
328
  self, client: AsyncDriver, dimensions: int, similarity_function: SimilarityFunction
329
329
  ) -> None:
330
+ import neo4j.exceptions
331
+
330
332
  label = Label.CHUNK
331
333
  logger.info(
332
334
  f"Creating index on nodes labeled '{label.value}' if it does not already exist."
333
335
  )
334
336
  index_name = f"{label.value.lower()}_vector"
335
- await client.execute_query(
336
- f"""
337
+ try:
338
+ await client.execute_query(
339
+ f"""
337
340
  CREATE VECTOR INDEX {index_name} IF NOT EXISTS
338
341
  FOR (n:{label.value}) ON n.embedding
339
342
  OPTIONS {{indexConfig: {{
@@ -341,7 +344,12 @@ class Neo4jUploader(Uploader):
341
344
  `vector.dimensions`: {dimensions}}}
342
345
  }}
343
346
  """
344
- )
347
+ )
348
+ except neo4j.exceptions.ClientError as e:
349
+ if e.code == "Neo.ClientError.Schema.EquivalentSchemaRuleAlreadyExists":
350
+ logger.info(f"Index on nodes labeled '{label.value}' already exists.")
351
+ else:
352
+ raise
345
353
 
346
354
  async def _delete_old_data_if_exists(self, file_data: FileData, client: AsyncDriver) -> None:
347
355
  logger.info(f"Deleting old data for the record '{file_data.identifier}' (if present).")
@@ -31,7 +31,9 @@ def conform_string_to_dict(value: Any) -> dict:
31
31
  raise ValidationError(f"Input could not be mapped to a valid dict: {value}")
32
32
 
33
33
 
34
- def format_and_truncate_orig_elements(element: dict) -> list[dict[str, Any]]:
34
+ def format_and_truncate_orig_elements(
35
+ element: dict, include_text: bool = False
36
+ ) -> list[dict[str, Any]]:
35
37
  """
36
38
  This function is used to format and truncate the orig_elements field in the metadata.
37
39
  This is used to remove the text field and other larger fields from the orig_elements
@@ -42,7 +44,8 @@ def format_and_truncate_orig_elements(element: dict) -> list[dict[str, Any]]:
42
44
  orig_elements = []
43
45
  if raw_orig_elements is not None:
44
46
  for element in elements_from_base64_gzipped_json(raw_orig_elements):
45
- element.pop("text", None)
47
+ if not include_text:
48
+ element.pop("text", None)
46
49
  for prop in (
47
50
  "image_base64",
48
51
  "text_as_html",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: unstructured-ingest
3
- Version: 0.5.12
3
+ Version: 0.5.13
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -22,13 +22,13 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Requires-Python: >=3.9.0,<3.14
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE.md
25
- Requires-Dist: dataclasses_json
26
25
  Requires-Dist: python-dateutil
27
- Requires-Dist: click
28
- Requires-Dist: pydantic>=2.7
29
- Requires-Dist: tqdm
26
+ Requires-Dist: dataclasses_json
30
27
  Requires-Dist: pandas
28
+ Requires-Dist: pydantic>=2.7
29
+ Requires-Dist: click
31
30
  Requires-Dist: opentelemetry-sdk
31
+ Requires-Dist: tqdm
32
32
  Provides-Extra: remote
33
33
  Requires-Dist: unstructured-client>=0.30.0; extra == "remote"
34
34
  Provides-Extra: csv
@@ -66,23 +66,23 @@ Requires-Dist: pyairtable; extra == "airtable"
66
66
  Provides-Extra: astradb
67
67
  Requires-Dist: astrapy; extra == "astradb"
68
68
  Provides-Extra: azure
69
- Requires-Dist: fsspec; extra == "azure"
70
69
  Requires-Dist: adlfs; extra == "azure"
70
+ Requires-Dist: fsspec; extra == "azure"
71
71
  Provides-Extra: azure-ai-search
72
72
  Requires-Dist: azure-search-documents; extra == "azure-ai-search"
73
73
  Provides-Extra: biomed
74
- Requires-Dist: bs4; extra == "biomed"
75
74
  Requires-Dist: requests; extra == "biomed"
75
+ Requires-Dist: bs4; extra == "biomed"
76
76
  Provides-Extra: box
77
- Requires-Dist: fsspec; extra == "box"
78
77
  Requires-Dist: boxfs; extra == "box"
78
+ Requires-Dist: fsspec; extra == "box"
79
79
  Provides-Extra: chroma
80
80
  Requires-Dist: chromadb; extra == "chroma"
81
81
  Provides-Extra: clarifai
82
82
  Requires-Dist: clarifai; extra == "clarifai"
83
83
  Provides-Extra: confluence
84
- Requires-Dist: atlassian-python-api; extra == "confluence"
85
84
  Requires-Dist: requests; extra == "confluence"
85
+ Requires-Dist: atlassian-python-api; extra == "confluence"
86
86
  Provides-Extra: couchbase
87
87
  Requires-Dist: couchbase; extra == "couchbase"
88
88
  Provides-Extra: delta-table
@@ -98,19 +98,19 @@ Requires-Dist: duckdb; extra == "duckdb"
98
98
  Provides-Extra: elasticsearch
99
99
  Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
100
100
  Provides-Extra: gcs
101
- Requires-Dist: bs4; extra == "gcs"
102
101
  Requires-Dist: gcsfs; extra == "gcs"
102
+ Requires-Dist: bs4; extra == "gcs"
103
103
  Requires-Dist: fsspec; extra == "gcs"
104
104
  Provides-Extra: github
105
- Requires-Dist: pygithub>1.58.0; extra == "github"
106
105
  Requires-Dist: requests; extra == "github"
106
+ Requires-Dist: pygithub>1.58.0; extra == "github"
107
107
  Provides-Extra: gitlab
108
108
  Requires-Dist: python-gitlab; extra == "gitlab"
109
109
  Provides-Extra: google-drive
110
110
  Requires-Dist: google-api-python-client; extra == "google-drive"
111
111
  Provides-Extra: hubspot
112
- Requires-Dist: urllib3; extra == "hubspot"
113
112
  Requires-Dist: hubspot-api-client; extra == "hubspot"
113
+ Requires-Dist: urllib3; extra == "hubspot"
114
114
  Provides-Extra: jira
115
115
  Requires-Dist: atlassian-python-api; extra == "jira"
116
116
  Provides-Extra: kafka
@@ -124,23 +124,23 @@ Requires-Dist: pymilvus; extra == "milvus"
124
124
  Provides-Extra: mongodb
125
125
  Requires-Dist: pymongo; extra == "mongodb"
126
126
  Provides-Extra: neo4j
127
+ Requires-Dist: cymple; extra == "neo4j"
127
128
  Requires-Dist: neo4j-rust-ext; extra == "neo4j"
128
129
  Requires-Dist: networkx; extra == "neo4j"
129
- Requires-Dist: cymple; extra == "neo4j"
130
130
  Provides-Extra: notion
131
- Requires-Dist: htmlBuilder; extra == "notion"
132
- Requires-Dist: httpx; extra == "notion"
133
131
  Requires-Dist: notion-client; extra == "notion"
132
+ Requires-Dist: htmlBuilder; extra == "notion"
134
133
  Requires-Dist: backoff; extra == "notion"
134
+ Requires-Dist: httpx; extra == "notion"
135
135
  Provides-Extra: onedrive
136
+ Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
136
137
  Requires-Dist: msal; extra == "onedrive"
137
138
  Requires-Dist: bs4; extra == "onedrive"
138
- Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
139
139
  Provides-Extra: opensearch
140
140
  Requires-Dist: opensearch-py; extra == "opensearch"
141
141
  Provides-Extra: outlook
142
- Requires-Dist: msal; extra == "outlook"
143
142
  Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
143
+ Requires-Dist: msal; extra == "outlook"
144
144
  Provides-Extra: pinecone
145
145
  Requires-Dist: pinecone-client>=3.7.1; extra == "pinecone"
146
146
  Provides-Extra: postgres
@@ -155,8 +155,8 @@ Provides-Extra: s3
155
155
  Requires-Dist: fsspec; extra == "s3"
156
156
  Requires-Dist: s3fs; extra == "s3"
157
157
  Provides-Extra: sharepoint
158
- Requires-Dist: msal; extra == "sharepoint"
159
158
  Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
159
+ Requires-Dist: msal; extra == "sharepoint"
160
160
  Provides-Extra: salesforce
161
161
  Requires-Dist: simple-salesforce; extra == "salesforce"
162
162
  Provides-Extra: sftp
@@ -178,22 +178,22 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
178
178
  Provides-Extra: singlestore
179
179
  Requires-Dist: singlestoredb; extra == "singlestore"
180
180
  Provides-Extra: vectara
181
- Requires-Dist: httpx; extra == "vectara"
182
181
  Requires-Dist: requests; extra == "vectara"
183
182
  Requires-Dist: aiofiles; extra == "vectara"
183
+ Requires-Dist: httpx; extra == "vectara"
184
184
  Provides-Extra: vastdb
185
185
  Requires-Dist: ibis; extra == "vastdb"
186
186
  Requires-Dist: vastdb; extra == "vastdb"
187
187
  Requires-Dist: pyarrow; extra == "vastdb"
188
188
  Provides-Extra: zendesk
189
- Requires-Dist: httpx; extra == "zendesk"
190
189
  Requires-Dist: bs4; extra == "zendesk"
191
190
  Requires-Dist: aiofiles; extra == "zendesk"
191
+ Requires-Dist: httpx; extra == "zendesk"
192
192
  Provides-Extra: embed-huggingface
193
193
  Requires-Dist: sentence-transformers; extra == "embed-huggingface"
194
194
  Provides-Extra: embed-octoai
195
- Requires-Dist: openai; extra == "embed-octoai"
196
195
  Requires-Dist: tiktoken; extra == "embed-octoai"
196
+ Requires-Dist: openai; extra == "embed-octoai"
197
197
  Provides-Extra: embed-vertexai
198
198
  Requires-Dist: vertexai; extra == "embed-vertexai"
199
199
  Provides-Extra: embed-voyageai
@@ -201,8 +201,8 @@ Requires-Dist: voyageai; extra == "embed-voyageai"
201
201
  Provides-Extra: embed-mixedbreadai
202
202
  Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
203
203
  Provides-Extra: openai
204
- Requires-Dist: openai; extra == "openai"
205
204
  Requires-Dist: tiktoken; extra == "openai"
205
+ Requires-Dist: openai; extra == "openai"
206
206
  Provides-Extra: bedrock
207
207
  Requires-Dist: boto3; extra == "bedrock"
208
208
  Requires-Dist: aioboto3; extra == "bedrock"
@@ -111,7 +111,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
111
111
  test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
112
112
  test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
113
113
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
114
- unstructured_ingest/__version__.py,sha256=RrghV4wKBupxOt4yEkkqnsRGy9iuYIS7TnVXDOGMwI8,43
114
+ unstructured_ingest/__version__.py,sha256=SoXJvWNbP_dJpjjbYHn-IaGnsULa9X8yicnEFO_W3yI,43
115
115
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
116
116
  unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
117
117
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -442,7 +442,7 @@ unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=VRDAiou_7oWOIAgQTdOG
442
442
  unstructured_ingest/v2/processes/connectors/local.py,sha256=ZvWTj6ZYkwnvQMNFsZWoaQyp9zp0WVqAywMaHJ2kcAc,7153
443
443
  unstructured_ingest/v2/processes/connectors/milvus.py,sha256=wmcu9NVy3gYlQGT25inN5w_QrhFoL8-hRq0pJFSNw8g,8866
444
444
  unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=cL0QUQZF_s2brh3nNNeAywXVpaIiND4b5JTAFlYjLjw,14273
445
- unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=QO8D0fptJ6C_K4m34KcYvAcrBgdve_Kcb0UVvY4edXs,18405
445
+ unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=vxf6Xuh-OMS09Y-mIF0PIwrFauqRtoI7vjeLBXsFwTk,18744
446
446
  unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=EM9fq67RsiudZvZbi6nDXkS-i6W0xLvbkNvD0G-Ni5E,17779
447
447
  unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
448
448
  unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=O9lC4mZ9V_exg9apiCJSWHsgkuYDSEOlI6CaUS5ZB7c,13961
@@ -450,7 +450,7 @@ unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=p0AY4ukBNpwAemV4bW
450
450
  unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
451
451
  unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=2T9Bm1H_ALwHhG_YP7vsuUUW-mUg61zcaae3aa9BnN4,4827
452
452
  unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
453
- unstructured_ingest/v2/processes/connectors/utils.py,sha256=ru_4e5lo5t1jJhR8sGYa5nNhX3gKTgC5B7Oze9qQJjo,2000
453
+ unstructured_ingest/v2/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
454
454
  unstructured_ingest/v2/processes/connectors/vectara.py,sha256=BlI_4nkpNR99aYxDd9eusm5LQsVB9EI0r-5Kc1D7pgQ,12255
455
455
  unstructured_ingest/v2/processes/connectors/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
456
456
  unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json,sha256=SJlIO0kXxy866tWQ8bEzvwLwflsoUMIS-OKlxMvHIuE,504
@@ -575,9 +575,9 @@ unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py,sha256=UZ_s8dnV
575
575
  unstructured_ingest/v2/processes/connectors/zendesk/__init__.py,sha256=XMNocKJ3FHDfy36p_KHhH7ALi0-ji6NhGuQNCV2E4vY,699
576
576
  unstructured_ingest/v2/processes/connectors/zendesk/client.py,sha256=MNyI6SUuxZHf_6zONoC6jR2f9BvyTYoMyGKDOhl4kgs,7897
577
577
  unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py,sha256=vQHZa5YYiDPXXPRAPMnPXhh0QzXeiBVx_YIWskZBQIc,15465
578
- unstructured_ingest-0.5.12.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
579
- unstructured_ingest-0.5.12.dist-info/METADATA,sha256=-IT5G8E8WZLSABmgyUZtxzQElpUVzKuVPqK54zFC-V0,8465
580
- unstructured_ingest-0.5.12.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
581
- unstructured_ingest-0.5.12.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
582
- unstructured_ingest-0.5.12.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
583
- unstructured_ingest-0.5.12.dist-info/RECORD,,
578
+ unstructured_ingest-0.5.13.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
579
+ unstructured_ingest-0.5.13.dist-info/METADATA,sha256=K95xEzr8Tq75w90-ivlwvfFhRkNRTPnNmtIiRXDXhjs,8465
580
+ unstructured_ingest-0.5.13.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
581
+ unstructured_ingest-0.5.13.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
582
+ unstructured_ingest-0.5.13.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
583
+ unstructured_ingest-0.5.13.dist-info/RECORD,,