unstructured-ingest 0.5.18__py3-none-any.whl → 0.5.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -23,20 +23,22 @@ from unstructured_ingest.v2.processes.connectors.redisdb import (
23
23
  )
24
24
 
25
25
 
26
- async def delete_record(client: Redis, element_id: str) -> None:
27
- await client.delete(element_id)
26
+ async def delete_record(client: Redis, element_id: str, key_prefix: str) -> None:
27
+ key_with_prefix = f"{key_prefix}{element_id}"
28
+ await client.delete(key_with_prefix)
28
29
 
29
30
 
30
- async def validate_upload(client: Redis, first_element: dict):
31
+ async def validate_upload(client: Redis, first_element: dict, key_prefix: str) -> None:
31
32
  element_id = first_element["element_id"]
33
+ key_with_prefix = f"{key_prefix}{element_id}"
32
34
  expected_text = first_element["text"]
33
35
  expected_embeddings = first_element["embeddings"]
34
36
  async with client.pipeline(transaction=True) as pipe:
35
37
  try:
36
- response = await pipe.json().get(element_id, "$").execute()
38
+ response = await pipe.json().get(key_with_prefix, "$").execute()
37
39
  response = response[0][0]
38
40
  except redis_exceptions.ResponseError:
39
- response = await pipe.get(element_id).execute()
41
+ response = await pipe.get(key_with_prefix).execute()
40
42
  response = json.loads(response[0])
41
43
 
42
44
  embedding_similarity = np.linalg.norm(
@@ -53,6 +55,7 @@ async def redis_destination_test(
53
55
  upload_file: Path,
54
56
  tmp_path: Path,
55
57
  connection_kwargs: dict,
58
+ uploader_config: dict,
56
59
  uri: Optional[str] = None,
57
60
  password: Optional[str] = None,
58
61
  ):
@@ -60,8 +63,9 @@ async def redis_destination_test(
60
63
  connection_config=RedisConnectionConfig(
61
64
  **connection_kwargs, access_config=RedisAccessConfig(uri=uri, password=password)
62
65
  ),
63
- upload_config=RedisUploaderConfig(batch_size=10),
66
+ upload_config=RedisUploaderConfig(batch_size=10, **uploader_config),
64
67
  )
68
+ key_prefix = uploader.upload_config.key_prefix
65
69
 
66
70
  file_data = FileData(
67
71
  source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
@@ -78,20 +82,32 @@ async def redis_destination_test(
78
82
 
79
83
  if uri:
80
84
  async with from_url(uri) as client:
81
- await validate_upload(client=client, first_element=first_element)
85
+ await validate_upload(
86
+ client=client,
87
+ first_element=first_element,
88
+ key_prefix=key_prefix,
89
+ )
82
90
  else:
83
91
  async with Redis(**connection_kwargs, password=password) as client:
84
- await validate_upload(client=client, first_element=first_element)
92
+ await validate_upload(
93
+ client=client,
94
+ first_element=first_element,
95
+ key_prefix=key_prefix,
96
+ )
85
97
  except Exception as e:
86
98
  raise e
87
99
  finally:
88
100
  if uri:
89
101
  async with from_url(uri) as client:
90
- tasks = [delete_record(client, element["element_id"]) for element in elements]
102
+ tasks = [
103
+ delete_record(client, element["element_id"], key_prefix) for element in elements
104
+ ]
91
105
  await asyncio.gather(*tasks)
92
106
  else:
93
107
  async with Redis(**connection_kwargs, password=password) as client:
94
- tasks = [delete_record(client, element["element_id"]) for element in elements]
108
+ tasks = [
109
+ delete_record(client, element["element_id"], key_prefix) for element in elements
110
+ ]
95
111
  await asyncio.gather(*tasks)
96
112
 
97
113
 
@@ -105,8 +121,13 @@ async def test_redis_destination_azure_with_password(upload_file: Path, tmp_path
105
121
  "db": 0,
106
122
  "ssl": True,
107
123
  }
124
+ uploader_config = {
125
+ "key_prefix": "test_ingest:",
126
+ }
108
127
  redis_pw = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
109
- await redis_destination_test(upload_file, tmp_path, connection_kwargs, password=redis_pw)
128
+ await redis_destination_test(
129
+ upload_file, tmp_path, connection_kwargs, uploader_config, password=redis_pw
130
+ )
110
131
 
111
132
 
112
133
  @pytest.mark.asyncio
@@ -114,6 +135,9 @@ async def test_redis_destination_azure_with_password(upload_file: Path, tmp_path
114
135
  @requires_env("AZURE_REDIS_INGEST_TEST_PASSWORD")
115
136
  async def test_redis_destination_azure_with_uri(upload_file: Path, tmp_path: Path):
116
137
  connection_kwargs = {}
138
+ uploader_config = {
139
+ "key_prefix": "test_ingest:",
140
+ }
117
141
  redis_pw = os.environ["AZURE_REDIS_INGEST_TEST_PASSWORD"]
118
142
  uri = f"rediss://:{redis_pw}@utic-dashboard-dev.redis.cache.windows.net:6380/0"
119
- await redis_destination_test(upload_file, tmp_path, connection_kwargs, uri=uri)
143
+ await redis_destination_test(upload_file, tmp_path, connection_kwargs, uploader_config, uri=uri)
@@ -1 +1 @@
1
- __version__ = "0.5.18" # pragma: no cover
1
+ __version__ = "0.5.19" # pragma: no cover
@@ -110,6 +110,7 @@ class RedisConnectionConfig(ConnectionConfig):
110
110
 
111
111
  class RedisUploaderConfig(UploaderConfig):
112
112
  batch_size: int = Field(default=100, description="Number of records per batch")
113
+ key_prefix: str = Field(default="", description="Prefix for Redis keys")
113
114
 
114
115
 
115
116
  @dataclass
@@ -145,11 +146,11 @@ class RedisUploader(Uploader):
145
146
  async with self.connection_config.create_async_client() as async_client:
146
147
  async with async_client.pipeline(transaction=True) as pipe:
147
148
  for element in batch:
148
- element_id = element["element_id"]
149
+ key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
149
150
  if redis_stack:
150
- pipe.json().set(element_id, "$", element)
151
+ pipe.json().set(key_with_prefix, "$", element)
151
152
  else:
152
- pipe.set(element_id, json.dumps(element))
153
+ pipe.set(key_with_prefix, json.dumps(element))
153
154
  await pipe.execute()
154
155
 
155
156
  @requires_dependencies(["redis"], extras="redis")
@@ -159,16 +160,16 @@ class RedisUploader(Uploader):
159
160
  redis_stack = True
160
161
  async with self.connection_config.create_async_client() as async_client:
161
162
  async with async_client.pipeline(transaction=True) as pipe:
162
- element_id = element["element_id"]
163
+ key_with_prefix = f"{self.upload_config.key_prefix}{element['element_id']}"
163
164
  try:
164
165
  # Redis with stack extension supports JSON type
165
- await pipe.json().set(element_id, "$", element).execute()
166
+ await pipe.json().set(key_with_prefix, "$", element).execute()
166
167
  except redis_exceptions.ResponseError as e:
167
168
  message = str(e)
168
169
  if "unknown command `JSON.SET`" in message:
169
170
  # if this error occurs, Redis server doesn't support JSON type,
170
171
  # so save as string type instead
171
- await pipe.set(element_id, json.dumps(element)).execute()
172
+ await pipe.set(key_with_prefix, json.dumps(element)).execute()
172
173
  redis_stack = False
173
174
  else:
174
175
  raise e
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: unstructured-ingest
3
- Version: 0.5.18
3
+ Version: 0.5.19
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -23,12 +23,12 @@ Requires-Python: >=3.9.0,<3.14
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE.md
25
25
  Requires-Dist: python-dateutil
26
+ Requires-Dist: dataclasses_json
26
27
  Requires-Dist: opentelemetry-sdk
27
28
  Requires-Dist: pandas
28
- Requires-Dist: dataclasses_json
29
- Requires-Dist: tqdm
30
29
  Requires-Dist: click
31
30
  Requires-Dist: pydantic>=2.7
31
+ Requires-Dist: tqdm
32
32
  Provides-Extra: remote
33
33
  Requires-Dist: unstructured-client>=0.30.0; extra == "remote"
34
34
  Provides-Extra: csv
@@ -66,16 +66,16 @@ Requires-Dist: pyairtable; extra == "airtable"
66
66
  Provides-Extra: astradb
67
67
  Requires-Dist: astrapy; extra == "astradb"
68
68
  Provides-Extra: azure
69
- Requires-Dist: adlfs; extra == "azure"
70
69
  Requires-Dist: fsspec; extra == "azure"
70
+ Requires-Dist: adlfs; extra == "azure"
71
71
  Provides-Extra: azure-ai-search
72
72
  Requires-Dist: azure-search-documents; extra == "azure-ai-search"
73
73
  Provides-Extra: biomed
74
- Requires-Dist: bs4; extra == "biomed"
75
74
  Requires-Dist: requests; extra == "biomed"
75
+ Requires-Dist: bs4; extra == "biomed"
76
76
  Provides-Extra: box
77
- Requires-Dist: boxfs; extra == "box"
78
77
  Requires-Dist: fsspec; extra == "box"
78
+ Requires-Dist: boxfs; extra == "box"
79
79
  Provides-Extra: chroma
80
80
  Requires-Dist: chromadb; extra == "chroma"
81
81
  Provides-Extra: clarifai
@@ -98,12 +98,12 @@ Requires-Dist: duckdb; extra == "duckdb"
98
98
  Provides-Extra: elasticsearch
99
99
  Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
100
100
  Provides-Extra: gcs
101
+ Requires-Dist: gcsfs; extra == "gcs"
101
102
  Requires-Dist: bs4; extra == "gcs"
102
103
  Requires-Dist: fsspec; extra == "gcs"
103
- Requires-Dist: gcsfs; extra == "gcs"
104
104
  Provides-Extra: github
105
- Requires-Dist: pygithub>1.58.0; extra == "github"
106
105
  Requires-Dist: requests; extra == "github"
106
+ Requires-Dist: pygithub>1.58.0; extra == "github"
107
107
  Provides-Extra: gitlab
108
108
  Requires-Dist: python-gitlab; extra == "gitlab"
109
109
  Provides-Extra: google-drive
@@ -128,19 +128,19 @@ Requires-Dist: networkx; extra == "neo4j"
128
128
  Requires-Dist: neo4j-rust-ext; extra == "neo4j"
129
129
  Requires-Dist: cymple; extra == "neo4j"
130
130
  Provides-Extra: notion
131
- Requires-Dist: httpx; extra == "notion"
132
131
  Requires-Dist: backoff; extra == "notion"
132
+ Requires-Dist: httpx; extra == "notion"
133
133
  Requires-Dist: notion-client; extra == "notion"
134
134
  Requires-Dist: htmlBuilder; extra == "notion"
135
135
  Provides-Extra: onedrive
136
136
  Requires-Dist: bs4; extra == "onedrive"
137
- Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
138
137
  Requires-Dist: msal; extra == "onedrive"
138
+ Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
139
139
  Provides-Extra: opensearch
140
140
  Requires-Dist: opensearch-py; extra == "opensearch"
141
141
  Provides-Extra: outlook
142
- Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
143
142
  Requires-Dist: msal; extra == "outlook"
143
+ Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
144
144
  Provides-Extra: pinecone
145
145
  Requires-Dist: pinecone-client>=3.7.1; extra == "pinecone"
146
146
  Provides-Extra: postgres
@@ -152,16 +152,16 @@ Requires-Dist: praw; extra == "reddit"
152
152
  Provides-Extra: redis
153
153
  Requires-Dist: redis; extra == "redis"
154
154
  Provides-Extra: s3
155
- Requires-Dist: fsspec; extra == "s3"
156
155
  Requires-Dist: s3fs; extra == "s3"
156
+ Requires-Dist: fsspec; extra == "s3"
157
157
  Provides-Extra: sharepoint
158
- Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
159
158
  Requires-Dist: msal; extra == "sharepoint"
159
+ Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
160
160
  Provides-Extra: salesforce
161
161
  Requires-Dist: simple-salesforce; extra == "salesforce"
162
162
  Provides-Extra: sftp
163
- Requires-Dist: paramiko; extra == "sftp"
164
163
  Requires-Dist: fsspec; extra == "sftp"
164
+ Requires-Dist: paramiko; extra == "sftp"
165
165
  Provides-Extra: slack
166
166
  Requires-Dist: slack_sdk[optional]; extra == "slack"
167
167
  Provides-Extra: snowflake
@@ -178,17 +178,17 @@ Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
178
178
  Provides-Extra: singlestore
179
179
  Requires-Dist: singlestoredb; extra == "singlestore"
180
180
  Provides-Extra: vectara
181
- Requires-Dist: aiofiles; extra == "vectara"
182
- Requires-Dist: httpx; extra == "vectara"
183
181
  Requires-Dist: requests; extra == "vectara"
182
+ Requires-Dist: httpx; extra == "vectara"
183
+ Requires-Dist: aiofiles; extra == "vectara"
184
184
  Provides-Extra: vastdb
185
- Requires-Dist: vastdb; extra == "vastdb"
186
185
  Requires-Dist: pyarrow; extra == "vastdb"
186
+ Requires-Dist: vastdb; extra == "vastdb"
187
187
  Requires-Dist: ibis; extra == "vastdb"
188
188
  Provides-Extra: zendesk
189
- Requires-Dist: aiofiles; extra == "zendesk"
190
- Requires-Dist: httpx; extra == "zendesk"
191
189
  Requires-Dist: bs4; extra == "zendesk"
190
+ Requires-Dist: httpx; extra == "zendesk"
191
+ Requires-Dist: aiofiles; extra == "zendesk"
192
192
  Provides-Extra: embed-huggingface
193
193
  Requires-Dist: sentence-transformers; extra == "embed-huggingface"
194
194
  Provides-Extra: embed-octoai
@@ -204,8 +204,8 @@ Provides-Extra: openai
204
204
  Requires-Dist: openai; extra == "openai"
205
205
  Requires-Dist: tiktoken; extra == "openai"
206
206
  Provides-Extra: bedrock
207
- Requires-Dist: aioboto3; extra == "bedrock"
208
207
  Requires-Dist: boto3; extra == "bedrock"
208
+ Requires-Dist: aioboto3; extra == "bedrock"
209
209
  Provides-Extra: togetherai
210
210
  Requires-Dist: together; extra == "togetherai"
211
211
  Dynamic: author
@@ -21,7 +21,7 @@ test/integration/connectors/test_notion.py,sha256=ueXyVqYWzP4LuZYe6PauptkXNG6qko
21
21
  test/integration/connectors/test_onedrive.py,sha256=iwiDK0kWCfQbIEPnWUzzAA5PiCsHcmFZSxEcIZy_6cc,5229
22
22
  test/integration/connectors/test_pinecone.py,sha256=9FC0frer7gtDzk5A6OhGsV8S4ggYfa5ReEO9t7L3Am0,13649
23
23
  test/integration/connectors/test_qdrant.py,sha256=Yme3ZZ5zIbaZ-yYLUqN2oy0hsrcAfvlleRLYWMSYeSE,8049
24
- test/integration/connectors/test_redis.py,sha256=1aKwOb-K4zCxZwHmgW_WzGJwqLntbWTbpGQ-rtUwN9o,4360
24
+ test/integration/connectors/test_redis.py,sha256=YXWWw4m40ZmLrf3eJ85hhT7WSJnri_GY1ieixIicYlI,5102
25
25
  test/integration/connectors/test_s3.py,sha256=E1dypeag_E3OIfpQWIz3jb7ctRHRD63UtyTrzyvJzpc,7473
26
26
  test/integration/connectors/test_sharepoint.py,sha256=weGby5YD6se7R7KLEq96hxUZYPzwoqZqXXTPhtQWZsQ,7646
27
27
  test/integration/connectors/test_vectara.py,sha256=4kKOOTGUjeZw2jKRcgVDI7ifbRPRZfjjVO4d_7H5C6I,8710
@@ -111,7 +111,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
111
111
  test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
112
112
  test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
113
113
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
114
- unstructured_ingest/__version__.py,sha256=QYn6GUOSyCz_KH2wi4yg_FlUU4SE844Xhf0hR6-jv8s,43
114
+ unstructured_ingest/__version__.py,sha256=YeBJuoTNGo0rz_5lKoO5e3ooyBOI71QLt4fdSp1KO_c,43
115
115
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
116
116
  unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
117
117
  unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -446,7 +446,7 @@ unstructured_ingest/v2/processes/connectors/neo4j.py,sha256=I-eDLAlThHKKFQfkZpQL
446
446
  unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=5rg7t40gKxDHNcuJrJHmVzJ9uM7Ct4RBOvFsfwdGc5c,18002
447
447
  unstructured_ingest/v2/processes/connectors/outlook.py,sha256=KgNGM8hImRhy6_SpswRP2VwRD4VOrqqJoySgxf2oduI,9290
448
448
  unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=O9lC4mZ9V_exg9apiCJSWHsgkuYDSEOlI6CaUS5ZB7c,13961
449
- unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=p0AY4ukBNpwAemV4bWzpScvVbLTVlI3DzsCNUKiBI5M,6757
449
+ unstructured_ingest/v2/processes/connectors/redisdb.py,sha256=0h105_MpOO4-uydiyHgM4TvduSkAMAr931KFANcKW8Y,6936
450
450
  unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=2CiO2ZZiZ1Y1-nB7wcDlDVcpW2B7ut9wCj66rkkqho0,11616
451
451
  unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=2T9Bm1H_ALwHhG_YP7vsuUUW-mUg61zcaae3aa9BnN4,4827
452
452
  unstructured_ingest/v2/processes/connectors/slack.py,sha256=Z73VmQ3oUY09KoLEi5OBdQeDt4ONEY_02SglWQc6HXE,9252
@@ -577,9 +577,9 @@ unstructured_ingest/v2/processes/connectors/zendesk/client.py,sha256=DDAYQB7catK
577
577
  unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py,sha256=R8SXYkRhVUoWEHdGCt2CzcTxxuFundw_0GlGZ34YmbM,8987
578
578
  unstructured_ingest/v2/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
579
579
  unstructured_ingest/v2/processes/utils/blob_storage.py,sha256=EWvK4HRYubr9i1UyMhv5cU9u0UzVkCDC_BIm4Uxab7Y,964
580
- unstructured_ingest-0.5.18.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
581
- unstructured_ingest-0.5.18.dist-info/METADATA,sha256=K47-NP1RfNwqRnvbZ8vO75ab5J5RSmb5nocwSXNwqko,8465
582
- unstructured_ingest-0.5.18.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
583
- unstructured_ingest-0.5.18.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
584
- unstructured_ingest-0.5.18.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
585
- unstructured_ingest-0.5.18.dist-info/RECORD,,
580
+ unstructured_ingest-0.5.19.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
581
+ unstructured_ingest-0.5.19.dist-info/METADATA,sha256=6veKDuElp9klfZfEzZIFIwPCchckH6Mf04qCc0ogN7M,8465
582
+ unstructured_ingest-0.5.19.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
583
+ unstructured_ingest-0.5.19.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
584
+ unstructured_ingest-0.5.19.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
585
+ unstructured_ingest-0.5.19.dist-info/RECORD,,