unstructured-ingest 0.7.1__py3-none-any.whl → 0.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -1 +1 @@
1
- __version__ = "0.7.1" # pragma: no cover
1
+ __version__ = "0.7.2" # pragma: no cover
@@ -1,99 +1,8 @@
1
- import ast
2
- import json
3
1
  import logging
4
- import typing as t
5
2
 
6
3
  logger = logging.getLogger("unstructured_ingest")
7
4
 
8
5
 
9
- def default_is_data_sensitive(k: str, v: t.Any) -> bool:
10
- sensitive_fields = [
11
- "account_name",
12
- "client_id",
13
- ]
14
- sensitive_triggers = ["key", "cred", "token", "password", "oauth", "secret"]
15
- return (
16
- v
17
- and any([s in k.lower() for s in sensitive_triggers]) # noqa: C419
18
- or k.lower() in sensitive_fields
19
- )
20
-
21
-
22
- def hide_sensitive_fields(
23
- data: dict, is_sensitive_fn: t.Callable[[str, t.Any], bool] = default_is_data_sensitive
24
- ) -> dict:
25
- """
26
- Will recursively look through every k, v pair in this dict and any nested ones and run
27
- is_sensitive_fn to dynamically redact the value of the k, v pair. Will also check if
28
- any string value can be parsed as valid json and process that dict as well and replace
29
- the original string with the json.dumps() version of the redacted dict.
30
- """
31
- new_data = data.copy()
32
- for k, v in new_data.items():
33
- if is_sensitive_fn(k, v):
34
- new_data[k] = "*******"
35
- if isinstance(v, dict):
36
- new_data[k] = hide_sensitive_fields(v)
37
- if isinstance(v, str):
38
- # Need to take into account strings generated via json.dumps() or simply printing a dict
39
- try:
40
- json_data = json.loads(v)
41
- if isinstance(json_data, dict):
42
- updated_data = hide_sensitive_fields(json_data)
43
- new_data[k] = json.dumps(updated_data)
44
- except json.JSONDecodeError:
45
- pass
46
-
47
- return new_data
48
-
49
-
50
- def redact_jsons(s: str) -> str:
51
- """
52
- Takes in a generic string and pulls out all valid json content. Leverages
53
- hide_sensitive_fields() to redact any sensitive information and replaces the
54
- original json with the new redacted format. There can be any number of valid
55
- jsons in a generic string and this will work. Having extra '{' without a
56
- closing '}' will cause this to break though. i.e '{ text, {"a": 3}'.
57
-
58
- """
59
- chars = list(s)
60
- if "{" not in chars:
61
- return s
62
- i = 0
63
- jsons = []
64
- i = 0
65
- while i < len(chars):
66
- char = chars[i]
67
- if char == "{":
68
- stack = [char]
69
- current = [char]
70
- while len(stack) != 0 and i < len(chars):
71
- i += 1
72
- char = chars[i]
73
- current.append(char)
74
- if char == "{":
75
- stack.append(char)
76
- if char == "}":
77
- stack.pop(-1)
78
- jsons.append("".join(current))
79
- continue
80
- i += 1
81
- for j in jsons:
82
- try:
83
- formatted_j = json.dumps(json.loads(j))
84
- except json.JSONDecodeError:
85
- formatted_j = json.dumps(ast.literal_eval(j))
86
- hidden_j = json.dumps(hide_sensitive_fields(json.loads(formatted_j)))
87
- s = s.replace(j, hidden_j)
88
- return s
89
-
90
-
91
- class SensitiveFormatter(logging.Formatter):
92
- def format(self, record):
93
- s = super().format(record=record)
94
- return redact_jsons(s)
95
-
96
-
97
6
  def remove_root_handlers(logger: logging.Logger) -> None:
98
7
  # NOTE(robinson): in some environments such as Google Colab, there is a root handler
99
8
  # that does not mask secrets, meaning sensitive info such as API keys appears in logs.
@@ -106,7 +15,7 @@ def remove_root_handlers(logger: logging.Logger) -> None:
106
15
  def ingest_log_streaming_init(level: int) -> None:
107
16
  handler = logging.StreamHandler()
108
17
  handler.name = "ingest_log_handler"
109
- formatter = SensitiveFormatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
18
+ formatter = logging.Formatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
110
19
  handler.setFormatter(formatter)
111
20
 
112
21
  # Only want to add the handler once
@@ -122,7 +31,7 @@ def make_default_logger(level: int) -> logging.Logger:
122
31
  logger = logging.getLogger("unstructured_ingest")
123
32
  handler = logging.StreamHandler()
124
33
  handler.name = "ingest_log_handler"
125
- formatter = SensitiveFormatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
34
+ formatter = logging.Formatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
126
35
  handler.setFormatter(formatter)
127
36
  logger.addHandler(handler)
128
37
  logger.setLevel(level)
@@ -0,0 +1,10 @@
1
+ CREATE TABLE elements (
2
+ id STRING NOT NULL PRIMARY KEY,
3
+ record_id STRING NOT NULL,
4
+ element_id STRING NOT NULL,
5
+ text STRING,
6
+ embeddings ARRAY<FLOAT>,
7
+ type STRING,
8
+ metadata VARIANT
9
+ );
10
+
@@ -0,0 +1,23 @@
1
+ {
2
+ "properties": [
3
+ {
4
+ "dataType": [
5
+ "text"
6
+ ],
7
+ "indexFilterable": true,
8
+ "indexSearchable": true,
9
+ "name": "record_id",
10
+ "tokenization": "word"
11
+ },
12
+ {
13
+ "dataType": [
14
+ "text"
15
+ ],
16
+ "indexFilterable": true,
17
+ "indexSearchable": true,
18
+ "name": "text",
19
+ "tokenization": "word"
20
+ }
21
+ ],
22
+ "vectorizer": "none"
23
+ }
@@ -1,14 +1,14 @@
1
1
  import json
2
2
  import re
3
- import typing as t
4
3
  from datetime import datetime
4
+ from typing import Any, Union
5
5
 
6
6
  from dateutil import parser
7
7
 
8
8
  from unstructured_ingest.logger import logger
9
9
 
10
10
 
11
- def json_to_dict(json_string: str) -> t.Union[str, t.Dict[str, t.Any]]:
11
+ def json_to_dict(json_string: str) -> Union[str, dict[str, Any]]:
12
12
  """Helper function attempts to deserialize json string to a dictionary."""
13
13
  try:
14
14
  return json.loads(json_string)
@@ -24,7 +24,7 @@ def json_to_dict(json_string: str) -> t.Union[str, t.Dict[str, t.Any]]:
24
24
  return json_string
25
25
 
26
26
 
27
- def ensure_isoformat_datetime(timestamp: t.Union[datetime, str]) -> str:
27
+ def ensure_isoformat_datetime(timestamp: Union[datetime, str]) -> str:
28
28
  """
29
29
  Ensures that the input value is converted to an ISO format datetime string.
30
30
  Handles both datetime objects and strings.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: unstructured-ingest
3
- Version: 0.7.1
3
+ Version: 0.7.2
4
4
  Summary: A library that prepares raw documents for downstream ML tasks.
5
5
  Home-page: https://github.com/Unstructured-IO/unstructured-ingest
6
6
  Author: Unstructured Technologies
@@ -22,349 +22,349 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
22
  Requires-Python: >=3.9.0,<3.14
23
23
  Description-Content-Type: text/markdown
24
24
  License-File: LICENSE.md
25
+ Requires-Dist: tqdm
25
26
  Requires-Dist: opentelemetry-sdk
26
- Requires-Dist: dataclasses_json
27
27
  Requires-Dist: click
28
- Requires-Dist: tqdm
29
- Requires-Dist: pydantic>=2.7
30
28
  Requires-Dist: python-dateutil
31
- Requires-Dist: pandas
29
+ Requires-Dist: pydantic>=2.7
30
+ Requires-Dist: dataclasses_json
32
31
  Requires-Dist: numpy
32
+ Requires-Dist: pandas
33
33
  Provides-Extra: remote
34
34
  Requires-Dist: unstructured-client>=0.30.0; extra == "remote"
35
- Requires-Dist: pandas; extra == "remote"
36
35
  Requires-Dist: numpy; extra == "remote"
36
+ Requires-Dist: pandas; extra == "remote"
37
37
  Provides-Extra: csv
38
38
  Requires-Dist: unstructured[tsv]; extra == "csv"
39
- Requires-Dist: pandas; extra == "csv"
40
39
  Requires-Dist: numpy; extra == "csv"
40
+ Requires-Dist: pandas; extra == "csv"
41
41
  Provides-Extra: doc
42
42
  Requires-Dist: unstructured[docx]; extra == "doc"
43
- Requires-Dist: pandas; extra == "doc"
44
43
  Requires-Dist: numpy; extra == "doc"
44
+ Requires-Dist: pandas; extra == "doc"
45
45
  Provides-Extra: docx
46
46
  Requires-Dist: unstructured[docx]; extra == "docx"
47
- Requires-Dist: pandas; extra == "docx"
48
47
  Requires-Dist: numpy; extra == "docx"
48
+ Requires-Dist: pandas; extra == "docx"
49
49
  Provides-Extra: epub
50
50
  Requires-Dist: unstructured[epub]; extra == "epub"
51
- Requires-Dist: pandas; extra == "epub"
52
51
  Requires-Dist: numpy; extra == "epub"
52
+ Requires-Dist: pandas; extra == "epub"
53
53
  Provides-Extra: md
54
54
  Requires-Dist: unstructured[md]; extra == "md"
55
- Requires-Dist: pandas; extra == "md"
56
55
  Requires-Dist: numpy; extra == "md"
56
+ Requires-Dist: pandas; extra == "md"
57
57
  Provides-Extra: msg
58
58
  Requires-Dist: unstructured[msg]; extra == "msg"
59
- Requires-Dist: pandas; extra == "msg"
60
59
  Requires-Dist: numpy; extra == "msg"
60
+ Requires-Dist: pandas; extra == "msg"
61
61
  Provides-Extra: odt
62
62
  Requires-Dist: unstructured[odt]; extra == "odt"
63
- Requires-Dist: pandas; extra == "odt"
64
63
  Requires-Dist: numpy; extra == "odt"
64
+ Requires-Dist: pandas; extra == "odt"
65
65
  Provides-Extra: org
66
66
  Requires-Dist: unstructured[org]; extra == "org"
67
- Requires-Dist: pandas; extra == "org"
68
67
  Requires-Dist: numpy; extra == "org"
68
+ Requires-Dist: pandas; extra == "org"
69
69
  Provides-Extra: pdf
70
70
  Requires-Dist: unstructured[pdf]; extra == "pdf"
71
- Requires-Dist: pandas; extra == "pdf"
72
71
  Requires-Dist: numpy; extra == "pdf"
72
+ Requires-Dist: pandas; extra == "pdf"
73
73
  Provides-Extra: ppt
74
74
  Requires-Dist: unstructured[pptx]; extra == "ppt"
75
- Requires-Dist: pandas; extra == "ppt"
76
75
  Requires-Dist: numpy; extra == "ppt"
76
+ Requires-Dist: pandas; extra == "ppt"
77
77
  Provides-Extra: pptx
78
78
  Requires-Dist: unstructured[pptx]; extra == "pptx"
79
- Requires-Dist: pandas; extra == "pptx"
80
79
  Requires-Dist: numpy; extra == "pptx"
80
+ Requires-Dist: pandas; extra == "pptx"
81
81
  Provides-Extra: rtf
82
82
  Requires-Dist: unstructured[rtf]; extra == "rtf"
83
- Requires-Dist: pandas; extra == "rtf"
84
83
  Requires-Dist: numpy; extra == "rtf"
84
+ Requires-Dist: pandas; extra == "rtf"
85
85
  Provides-Extra: rst
86
86
  Requires-Dist: unstructured[rst]; extra == "rst"
87
- Requires-Dist: pandas; extra == "rst"
88
87
  Requires-Dist: numpy; extra == "rst"
88
+ Requires-Dist: pandas; extra == "rst"
89
89
  Provides-Extra: tsv
90
90
  Requires-Dist: unstructured[tsv]; extra == "tsv"
91
- Requires-Dist: pandas; extra == "tsv"
92
91
  Requires-Dist: numpy; extra == "tsv"
92
+ Requires-Dist: pandas; extra == "tsv"
93
93
  Provides-Extra: xlsx
94
94
  Requires-Dist: unstructured[xlsx]; extra == "xlsx"
95
- Requires-Dist: pandas; extra == "xlsx"
96
95
  Requires-Dist: numpy; extra == "xlsx"
96
+ Requires-Dist: pandas; extra == "xlsx"
97
97
  Provides-Extra: airtable
98
98
  Requires-Dist: pyairtable; extra == "airtable"
99
- Requires-Dist: pandas; extra == "airtable"
100
99
  Requires-Dist: numpy; extra == "airtable"
100
+ Requires-Dist: pandas; extra == "airtable"
101
101
  Provides-Extra: astradb
102
102
  Requires-Dist: astrapy; extra == "astradb"
103
- Requires-Dist: pandas; extra == "astradb"
104
103
  Requires-Dist: numpy; extra == "astradb"
104
+ Requires-Dist: pandas; extra == "astradb"
105
105
  Provides-Extra: azure
106
- Requires-Dist: fsspec; extra == "azure"
107
106
  Requires-Dist: adlfs; extra == "azure"
108
- Requires-Dist: pandas; extra == "azure"
107
+ Requires-Dist: fsspec; extra == "azure"
109
108
  Requires-Dist: numpy; extra == "azure"
109
+ Requires-Dist: pandas; extra == "azure"
110
110
  Provides-Extra: azure-ai-search
111
111
  Requires-Dist: azure-search-documents; extra == "azure-ai-search"
112
- Requires-Dist: pandas; extra == "azure-ai-search"
113
112
  Requires-Dist: numpy; extra == "azure-ai-search"
113
+ Requires-Dist: pandas; extra == "azure-ai-search"
114
114
  Provides-Extra: biomed
115
- Requires-Dist: bs4; extra == "biomed"
116
115
  Requires-Dist: requests; extra == "biomed"
117
- Requires-Dist: pandas; extra == "biomed"
116
+ Requires-Dist: bs4; extra == "biomed"
118
117
  Requires-Dist: numpy; extra == "biomed"
118
+ Requires-Dist: pandas; extra == "biomed"
119
119
  Provides-Extra: box
120
- Requires-Dist: fsspec; extra == "box"
121
120
  Requires-Dist: boxfs; extra == "box"
122
- Requires-Dist: pandas; extra == "box"
121
+ Requires-Dist: fsspec; extra == "box"
123
122
  Requires-Dist: numpy; extra == "box"
123
+ Requires-Dist: pandas; extra == "box"
124
124
  Provides-Extra: chroma
125
125
  Requires-Dist: chromadb; extra == "chroma"
126
- Requires-Dist: pandas; extra == "chroma"
127
126
  Requires-Dist: numpy; extra == "chroma"
127
+ Requires-Dist: pandas; extra == "chroma"
128
128
  Provides-Extra: clarifai
129
129
  Requires-Dist: clarifai; extra == "clarifai"
130
- Requires-Dist: pandas; extra == "clarifai"
131
130
  Requires-Dist: numpy; extra == "clarifai"
131
+ Requires-Dist: pandas; extra == "clarifai"
132
132
  Provides-Extra: confluence
133
- Requires-Dist: requests; extra == "confluence"
134
133
  Requires-Dist: atlassian-python-api; extra == "confluence"
135
- Requires-Dist: pandas; extra == "confluence"
134
+ Requires-Dist: requests; extra == "confluence"
136
135
  Requires-Dist: numpy; extra == "confluence"
136
+ Requires-Dist: pandas; extra == "confluence"
137
137
  Provides-Extra: couchbase
138
138
  Requires-Dist: couchbase; extra == "couchbase"
139
- Requires-Dist: pandas; extra == "couchbase"
140
139
  Requires-Dist: numpy; extra == "couchbase"
140
+ Requires-Dist: pandas; extra == "couchbase"
141
141
  Provides-Extra: delta-table
142
142
  Requires-Dist: deltalake; extra == "delta-table"
143
143
  Requires-Dist: boto3; extra == "delta-table"
144
- Requires-Dist: pandas; extra == "delta-table"
145
144
  Requires-Dist: numpy; extra == "delta-table"
145
+ Requires-Dist: pandas; extra == "delta-table"
146
146
  Provides-Extra: discord
147
147
  Requires-Dist: discord.py; extra == "discord"
148
- Requires-Dist: pandas; extra == "discord"
149
148
  Requires-Dist: numpy; extra == "discord"
149
+ Requires-Dist: pandas; extra == "discord"
150
150
  Provides-Extra: dropbox
151
151
  Requires-Dist: dropboxdrivefs; extra == "dropbox"
152
152
  Requires-Dist: fsspec; extra == "dropbox"
153
- Requires-Dist: pandas; extra == "dropbox"
154
153
  Requires-Dist: numpy; extra == "dropbox"
154
+ Requires-Dist: pandas; extra == "dropbox"
155
155
  Provides-Extra: duckdb
156
156
  Requires-Dist: duckdb; extra == "duckdb"
157
- Requires-Dist: pandas; extra == "duckdb"
158
157
  Requires-Dist: numpy; extra == "duckdb"
158
+ Requires-Dist: pandas; extra == "duckdb"
159
159
  Provides-Extra: elasticsearch
160
160
  Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
161
- Requires-Dist: pandas; extra == "elasticsearch"
162
161
  Requires-Dist: numpy; extra == "elasticsearch"
162
+ Requires-Dist: pandas; extra == "elasticsearch"
163
163
  Provides-Extra: gcs
164
+ Requires-Dist: gcsfs; extra == "gcs"
164
165
  Requires-Dist: bs4; extra == "gcs"
165
166
  Requires-Dist: fsspec; extra == "gcs"
166
- Requires-Dist: gcsfs; extra == "gcs"
167
- Requires-Dist: pandas; extra == "gcs"
168
167
  Requires-Dist: numpy; extra == "gcs"
168
+ Requires-Dist: pandas; extra == "gcs"
169
169
  Provides-Extra: github
170
170
  Requires-Dist: requests; extra == "github"
171
171
  Requires-Dist: pygithub>1.58.0; extra == "github"
172
- Requires-Dist: pandas; extra == "github"
173
172
  Requires-Dist: numpy; extra == "github"
173
+ Requires-Dist: pandas; extra == "github"
174
174
  Provides-Extra: gitlab
175
175
  Requires-Dist: python-gitlab; extra == "gitlab"
176
- Requires-Dist: pandas; extra == "gitlab"
177
176
  Requires-Dist: numpy; extra == "gitlab"
177
+ Requires-Dist: pandas; extra == "gitlab"
178
178
  Provides-Extra: google-drive
179
179
  Requires-Dist: google-api-python-client; extra == "google-drive"
180
- Requires-Dist: pandas; extra == "google-drive"
181
180
  Requires-Dist: numpy; extra == "google-drive"
181
+ Requires-Dist: pandas; extra == "google-drive"
182
182
  Provides-Extra: hubspot
183
- Requires-Dist: hubspot-api-client; extra == "hubspot"
184
183
  Requires-Dist: urllib3; extra == "hubspot"
185
- Requires-Dist: pandas; extra == "hubspot"
184
+ Requires-Dist: hubspot-api-client; extra == "hubspot"
186
185
  Requires-Dist: numpy; extra == "hubspot"
186
+ Requires-Dist: pandas; extra == "hubspot"
187
187
  Provides-Extra: ibm-watsonx-s3
188
188
  Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
189
189
  Requires-Dist: httpx; extra == "ibm-watsonx-s3"
190
- Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
191
190
  Requires-Dist: pyiceberg; extra == "ibm-watsonx-s3"
192
- Requires-Dist: pandas; extra == "ibm-watsonx-s3"
191
+ Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
193
192
  Requires-Dist: numpy; extra == "ibm-watsonx-s3"
193
+ Requires-Dist: pandas; extra == "ibm-watsonx-s3"
194
194
  Provides-Extra: jira
195
195
  Requires-Dist: atlassian-python-api; extra == "jira"
196
- Requires-Dist: pandas; extra == "jira"
197
196
  Requires-Dist: numpy; extra == "jira"
197
+ Requires-Dist: pandas; extra == "jira"
198
198
  Provides-Extra: kafka
199
199
  Requires-Dist: confluent-kafka; extra == "kafka"
200
- Requires-Dist: pandas; extra == "kafka"
201
200
  Requires-Dist: numpy; extra == "kafka"
201
+ Requires-Dist: pandas; extra == "kafka"
202
202
  Provides-Extra: kdbai
203
203
  Requires-Dist: kdbai-client>=1.4.0; extra == "kdbai"
204
- Requires-Dist: pandas; extra == "kdbai"
205
204
  Requires-Dist: numpy; extra == "kdbai"
205
+ Requires-Dist: pandas; extra == "kdbai"
206
206
  Provides-Extra: lancedb
207
207
  Requires-Dist: lancedb; extra == "lancedb"
208
- Requires-Dist: pandas; extra == "lancedb"
209
208
  Requires-Dist: numpy; extra == "lancedb"
209
+ Requires-Dist: pandas; extra == "lancedb"
210
210
  Provides-Extra: milvus
211
211
  Requires-Dist: pymilvus; extra == "milvus"
212
- Requires-Dist: pandas; extra == "milvus"
213
212
  Requires-Dist: numpy; extra == "milvus"
213
+ Requires-Dist: pandas; extra == "milvus"
214
214
  Provides-Extra: mongodb
215
215
  Requires-Dist: pymongo; extra == "mongodb"
216
- Requires-Dist: pandas; extra == "mongodb"
217
216
  Requires-Dist: numpy; extra == "mongodb"
217
+ Requires-Dist: pandas; extra == "mongodb"
218
218
  Provides-Extra: neo4j
219
- Requires-Dist: cymple; extra == "neo4j"
220
219
  Requires-Dist: neo4j-rust-ext; extra == "neo4j"
221
220
  Requires-Dist: networkx; extra == "neo4j"
222
- Requires-Dist: pandas; extra == "neo4j"
221
+ Requires-Dist: cymple; extra == "neo4j"
223
222
  Requires-Dist: numpy; extra == "neo4j"
223
+ Requires-Dist: pandas; extra == "neo4j"
224
224
  Provides-Extra: notion
225
225
  Requires-Dist: notion-client; extra == "notion"
226
- Requires-Dist: backoff; extra == "notion"
227
226
  Requires-Dist: httpx; extra == "notion"
227
+ Requires-Dist: backoff; extra == "notion"
228
228
  Requires-Dist: htmlBuilder; extra == "notion"
229
- Requires-Dist: pandas; extra == "notion"
230
229
  Requires-Dist: numpy; extra == "notion"
230
+ Requires-Dist: pandas; extra == "notion"
231
231
  Provides-Extra: onedrive
232
- Requires-Dist: msal; extra == "onedrive"
233
- Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
234
232
  Requires-Dist: requests; extra == "onedrive"
235
- Requires-Dist: pandas; extra == "onedrive"
233
+ Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
234
+ Requires-Dist: msal; extra == "onedrive"
236
235
  Requires-Dist: numpy; extra == "onedrive"
236
+ Requires-Dist: pandas; extra == "onedrive"
237
237
  Provides-Extra: opensearch
238
238
  Requires-Dist: opensearch-py; extra == "opensearch"
239
- Requires-Dist: pandas; extra == "opensearch"
240
239
  Requires-Dist: numpy; extra == "opensearch"
240
+ Requires-Dist: pandas; extra == "opensearch"
241
241
  Provides-Extra: outlook
242
- Requires-Dist: msal; extra == "outlook"
243
242
  Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
244
- Requires-Dist: pandas; extra == "outlook"
243
+ Requires-Dist: msal; extra == "outlook"
245
244
  Requires-Dist: numpy; extra == "outlook"
245
+ Requires-Dist: pandas; extra == "outlook"
246
246
  Provides-Extra: pinecone
247
247
  Requires-Dist: pinecone-client>=3.7.1; extra == "pinecone"
248
- Requires-Dist: pandas; extra == "pinecone"
249
248
  Requires-Dist: numpy; extra == "pinecone"
249
+ Requires-Dist: pandas; extra == "pinecone"
250
250
  Provides-Extra: postgres
251
251
  Requires-Dist: psycopg2-binary; extra == "postgres"
252
- Requires-Dist: pandas; extra == "postgres"
253
252
  Requires-Dist: numpy; extra == "postgres"
253
+ Requires-Dist: pandas; extra == "postgres"
254
254
  Provides-Extra: qdrant
255
255
  Requires-Dist: qdrant-client; extra == "qdrant"
256
- Requires-Dist: pandas; extra == "qdrant"
257
256
  Requires-Dist: numpy; extra == "qdrant"
257
+ Requires-Dist: pandas; extra == "qdrant"
258
258
  Provides-Extra: reddit
259
259
  Requires-Dist: praw; extra == "reddit"
260
- Requires-Dist: pandas; extra == "reddit"
261
260
  Requires-Dist: numpy; extra == "reddit"
261
+ Requires-Dist: pandas; extra == "reddit"
262
262
  Provides-Extra: redis
263
263
  Requires-Dist: redis; extra == "redis"
264
- Requires-Dist: pandas; extra == "redis"
265
264
  Requires-Dist: numpy; extra == "redis"
265
+ Requires-Dist: pandas; extra == "redis"
266
266
  Provides-Extra: s3
267
- Requires-Dist: fsspec; extra == "s3"
268
267
  Requires-Dist: s3fs; extra == "s3"
269
- Requires-Dist: pandas; extra == "s3"
268
+ Requires-Dist: fsspec; extra == "s3"
270
269
  Requires-Dist: numpy; extra == "s3"
270
+ Requires-Dist: pandas; extra == "s3"
271
271
  Provides-Extra: sharepoint
272
- Requires-Dist: msal; extra == "sharepoint"
273
- Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
274
272
  Requires-Dist: requests; extra == "sharepoint"
275
- Requires-Dist: pandas; extra == "sharepoint"
273
+ Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
274
+ Requires-Dist: msal; extra == "sharepoint"
276
275
  Requires-Dist: numpy; extra == "sharepoint"
276
+ Requires-Dist: pandas; extra == "sharepoint"
277
277
  Provides-Extra: salesforce
278
278
  Requires-Dist: simple-salesforce; extra == "salesforce"
279
- Requires-Dist: pandas; extra == "salesforce"
280
279
  Requires-Dist: numpy; extra == "salesforce"
280
+ Requires-Dist: pandas; extra == "salesforce"
281
281
  Provides-Extra: sftp
282
282
  Requires-Dist: paramiko; extra == "sftp"
283
283
  Requires-Dist: fsspec; extra == "sftp"
284
- Requires-Dist: pandas; extra == "sftp"
285
284
  Requires-Dist: numpy; extra == "sftp"
285
+ Requires-Dist: pandas; extra == "sftp"
286
286
  Provides-Extra: slack
287
287
  Requires-Dist: slack_sdk[optional]; extra == "slack"
288
- Requires-Dist: pandas; extra == "slack"
289
288
  Requires-Dist: numpy; extra == "slack"
289
+ Requires-Dist: pandas; extra == "slack"
290
290
  Provides-Extra: snowflake
291
291
  Requires-Dist: snowflake-connector-python; extra == "snowflake"
292
292
  Requires-Dist: psycopg2-binary; extra == "snowflake"
293
- Requires-Dist: pandas; extra == "snowflake"
294
293
  Requires-Dist: numpy; extra == "snowflake"
294
+ Requires-Dist: pandas; extra == "snowflake"
295
295
  Provides-Extra: wikipedia
296
296
  Requires-Dist: wikipedia; extra == "wikipedia"
297
- Requires-Dist: pandas; extra == "wikipedia"
298
297
  Requires-Dist: numpy; extra == "wikipedia"
298
+ Requires-Dist: pandas; extra == "wikipedia"
299
299
  Provides-Extra: weaviate
300
300
  Requires-Dist: weaviate-client; extra == "weaviate"
301
- Requires-Dist: pandas; extra == "weaviate"
302
301
  Requires-Dist: numpy; extra == "weaviate"
302
+ Requires-Dist: pandas; extra == "weaviate"
303
303
  Provides-Extra: databricks-volumes
304
304
  Requires-Dist: databricks-sdk; extra == "databricks-volumes"
305
- Requires-Dist: pandas; extra == "databricks-volumes"
306
305
  Requires-Dist: numpy; extra == "databricks-volumes"
306
+ Requires-Dist: pandas; extra == "databricks-volumes"
307
307
  Provides-Extra: databricks-delta-tables
308
308
  Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
309
- Requires-Dist: pandas; extra == "databricks-delta-tables"
310
309
  Requires-Dist: numpy; extra == "databricks-delta-tables"
310
+ Requires-Dist: pandas; extra == "databricks-delta-tables"
311
311
  Provides-Extra: singlestore
312
312
  Requires-Dist: singlestoredb; extra == "singlestore"
313
- Requires-Dist: pandas; extra == "singlestore"
314
313
  Requires-Dist: numpy; extra == "singlestore"
314
+ Requires-Dist: pandas; extra == "singlestore"
315
315
  Provides-Extra: vectara
316
316
  Requires-Dist: requests; extra == "vectara"
317
317
  Requires-Dist: httpx; extra == "vectara"
318
318
  Requires-Dist: aiofiles; extra == "vectara"
319
- Requires-Dist: pandas; extra == "vectara"
320
319
  Requires-Dist: numpy; extra == "vectara"
320
+ Requires-Dist: pandas; extra == "vectara"
321
321
  Provides-Extra: vastdb
322
322
  Requires-Dist: ibis; extra == "vastdb"
323
323
  Requires-Dist: vastdb; extra == "vastdb"
324
324
  Requires-Dist: pyarrow; extra == "vastdb"
325
- Requires-Dist: pandas; extra == "vastdb"
326
325
  Requires-Dist: numpy; extra == "vastdb"
326
+ Requires-Dist: pandas; extra == "vastdb"
327
327
  Provides-Extra: zendesk
328
+ Requires-Dist: aiofiles; extra == "zendesk"
328
329
  Requires-Dist: bs4; extra == "zendesk"
329
330
  Requires-Dist: httpx; extra == "zendesk"
330
- Requires-Dist: aiofiles; extra == "zendesk"
331
- Requires-Dist: pandas; extra == "zendesk"
332
331
  Requires-Dist: numpy; extra == "zendesk"
332
+ Requires-Dist: pandas; extra == "zendesk"
333
333
  Provides-Extra: embed-huggingface
334
334
  Requires-Dist: sentence-transformers; extra == "embed-huggingface"
335
- Requires-Dist: pandas; extra == "embed-huggingface"
336
335
  Requires-Dist: numpy; extra == "embed-huggingface"
336
+ Requires-Dist: pandas; extra == "embed-huggingface"
337
337
  Provides-Extra: embed-octoai
338
- Requires-Dist: tiktoken; extra == "embed-octoai"
339
338
  Requires-Dist: openai; extra == "embed-octoai"
340
- Requires-Dist: pandas; extra == "embed-octoai"
339
+ Requires-Dist: tiktoken; extra == "embed-octoai"
341
340
  Requires-Dist: numpy; extra == "embed-octoai"
341
+ Requires-Dist: pandas; extra == "embed-octoai"
342
342
  Provides-Extra: embed-vertexai
343
343
  Requires-Dist: vertexai; extra == "embed-vertexai"
344
- Requires-Dist: pandas; extra == "embed-vertexai"
345
344
  Requires-Dist: numpy; extra == "embed-vertexai"
345
+ Requires-Dist: pandas; extra == "embed-vertexai"
346
346
  Provides-Extra: embed-voyageai
347
347
  Requires-Dist: voyageai; extra == "embed-voyageai"
348
- Requires-Dist: pandas; extra == "embed-voyageai"
349
348
  Requires-Dist: numpy; extra == "embed-voyageai"
349
+ Requires-Dist: pandas; extra == "embed-voyageai"
350
350
  Provides-Extra: embed-mixedbreadai
351
351
  Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
352
- Requires-Dist: pandas; extra == "embed-mixedbreadai"
353
352
  Requires-Dist: numpy; extra == "embed-mixedbreadai"
353
+ Requires-Dist: pandas; extra == "embed-mixedbreadai"
354
354
  Provides-Extra: openai
355
- Requires-Dist: tiktoken; extra == "openai"
356
355
  Requires-Dist: openai; extra == "openai"
357
- Requires-Dist: pandas; extra == "openai"
356
+ Requires-Dist: tiktoken; extra == "openai"
358
357
  Requires-Dist: numpy; extra == "openai"
358
+ Requires-Dist: pandas; extra == "openai"
359
359
  Provides-Extra: bedrock
360
- Requires-Dist: boto3; extra == "bedrock"
361
360
  Requires-Dist: aioboto3; extra == "bedrock"
362
- Requires-Dist: pandas; extra == "bedrock"
361
+ Requires-Dist: boto3; extra == "bedrock"
363
362
  Requires-Dist: numpy; extra == "bedrock"
363
+ Requires-Dist: pandas; extra == "bedrock"
364
364
  Provides-Extra: togetherai
365
365
  Requires-Dist: together; extra == "togetherai"
366
- Requires-Dist: pandas; extra == "togetherai"
367
366
  Requires-Dist: numpy; extra == "togetherai"
367
+ Requires-Dist: pandas; extra == "togetherai"
368
368
  Dynamic: author
369
369
  Dynamic: author-email
370
370
  Dynamic: classifier
@@ -101,7 +101,6 @@ test/unit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
101
101
  test/unit/test_error.py,sha256=RflmngCdFNKOLXVfLnUdNfY3Mfg3k7DTEzfIl0B-syU,840
102
102
  test/unit/test_html.py,sha256=ubsck9pVOnPDFL0P8TZkko_46MIaFLlSNQcsgFDgYoE,4496
103
103
  test/unit/test_interfaces.py,sha256=Gv3WMJsw_3xPLy3nI3dIcJuLa2WvKYszSjI_W9XLtVM,787
104
- test/unit/test_logger.py,sha256=0SKndXE_VRd8XmUHkrj7zuBQHZscXx3ZQllMEOvtF9Y,2380
105
104
  test/unit/test_utils.py,sha256=xeSM02zOChSOO3dzDOVAEiQme1rQ8drjnJF93S3BFmk,7247
106
105
  test/unit/chunkers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
107
106
  test/unit/chunkers/test_chunkers.py,sha256=wRxbSj7P1FwRGDyVcARkm8CQSVCBCro3nTe54UoUBzc,1769
@@ -134,10 +133,10 @@ test/unit/partitioners/test_partitioner.py,sha256=eJoUDbiKtweyU1WYfsY5KqVqoPjbx1
134
133
  test/unit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
135
134
  test/unit/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
136
135
  unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
137
- unstructured_ingest/__version__.py,sha256=YP5dlQlaTHZ-KOck8o_UzdjIFae7iENB5d3AMIKlZ3M,42
136
+ unstructured_ingest/__version__.py,sha256=7O8GlC09PP-XuUDOj6bhRUtbOuUgpBT2COw4AjU1kk0,42
138
137
  unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
139
138
  unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
140
- unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
139
+ unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
141
140
  unstructured_ingest/main.py,sha256=82G_7eG4PNhc_xIqj4Y_sFbDV9VI-nwSfsfJQMzovMk,169
142
141
  unstructured_ingest/otel.py,sha256=NsUqOolA0gt69eFhZLABjVpcKoM9aus-AbxIKqWqPTc,4127
143
142
  unstructured_ingest/unstructured_api.py,sha256=hWUXUhGtyfi2OcDR-BriHJyT4jJywf4zfG1qpSCf9Bo,5002
@@ -222,6 +221,8 @@ unstructured_ingest/processes/connectors/slack.py,sha256=e4ntATdht_olAPsco1DKwlr
222
221
  unstructured_ingest/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
223
222
  unstructured_ingest/processes/connectors/vectara.py,sha256=frKJkc7ffstQhXD9-HkAGoQAofGkl6AsnKJhGcl8LgA,12294
224
223
  unstructured_ingest/processes/connectors/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
224
+ unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql,sha256=dUZZDNkyvQXKqoAThRz3ek7zaUE2l_LAQimlG5WZhH4,211
225
+ unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json,sha256=SJlIO0kXxy866tWQ8bEzvwLwflsoUMIS-OKlxMvHIuE,504
225
226
  unstructured_ingest/processes/connectors/databricks/__init__.py,sha256=RtKAPyNtXh6fzEsOQ08pA0-vC1uMr3KqYG6cqiBoo70,2133
226
227
  unstructured_ingest/processes/connectors/databricks/volumes.py,sha256=OWQrne9-5hPzc-kxGa2P53M3DoksDzMDyjLhQyihdCo,8020
227
228
  unstructured_ingest/processes/connectors/databricks/volumes_aws.py,sha256=RP9rq2sfysygiqzXj6eX0CXeZpxk65xmrz7HZnWRQWA,2961
@@ -360,11 +361,11 @@ unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWh
360
361
  unstructured_ingest/utils/html.py,sha256=0WduP8tI5S3nHFQi6XHNPHgsIC9j3iWwyIayX9gDLiE,6386
361
362
  unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01qAbElH0,1201
362
363
  unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
363
- unstructured_ingest/utils/string_and_date_utils.py,sha256=QBj8HXZGvDZQSULLOQwJ8tb3r2aYrTBQ71rkiV6gZdI,2519
364
+ unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
364
365
  unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
365
- unstructured_ingest-0.7.1.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
366
- unstructured_ingest-0.7.1.dist-info/METADATA,sha256=Ex_8EkItZzbGEoRJwR7Fqm_t0aajIZLVdtzwL7XBsQw,15050
367
- unstructured_ingest-0.7.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
368
- unstructured_ingest-0.7.1.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
369
- unstructured_ingest-0.7.1.dist-info/top_level.txt,sha256=85vUyT6fV2A5eCEM3M3FPRUUI9vZOVK1xVZt7eo1oV8,34
370
- unstructured_ingest-0.7.1.dist-info/RECORD,,
366
+ unstructured_ingest-0.7.2.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
367
+ unstructured_ingest-0.7.2.dist-info/METADATA,sha256=BjJRt_WKMPbiOWOxGZPs3Q9ZmwHRkPfF0FbWT7X7lA4,15050
368
+ unstructured_ingest-0.7.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
369
+ unstructured_ingest-0.7.2.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
370
+ unstructured_ingest-0.7.2.dist-info/top_level.txt,sha256=85vUyT6fV2A5eCEM3M3FPRUUI9vZOVK1xVZt7eo1oV8,34
371
+ unstructured_ingest-0.7.2.dist-info/RECORD,,
test/unit/test_logger.py DELETED
@@ -1,78 +0,0 @@
1
- import json
2
-
3
- import pytest
4
-
5
- from unstructured_ingest.logger import (
6
- default_is_data_sensitive,
7
- hide_sensitive_fields,
8
- redact_jsons,
9
- )
10
-
11
-
12
- @pytest.mark.parametrize(
13
- ("key", "value", "is_sensitive"),
14
- [
15
- ("username", "john_smith", False),
16
- ("password", "13?H%", True),
17
- ("token", "123", True),
18
- ("AWS_CREDENTIAL", "aws_credential", True),
19
- ("AWS_KEY", None, False),
20
- ],
21
- )
22
- def test_default_is_sensitive(key, value, is_sensitive):
23
- assert default_is_data_sensitive(key, value) == is_sensitive
24
-
25
-
26
- def test_hide_sensitive_fields():
27
- d = {
28
- "username": "john_smith",
29
- "password": "13?H%",
30
- "inner": {
31
- "token": "123",
32
- "AWS_KEY": None,
33
- "inner_j_string": json.dumps(
34
- {"account_name": "secret name", "client_id": 123, "timestamp": 123}
35
- ),
36
- },
37
- }
38
- redacted_d = hide_sensitive_fields(d)
39
- expected_d = {
40
- "password": "*******",
41
- "username": "john_smith",
42
- "inner": {
43
- "token": "*******",
44
- "AWS_KEY": None,
45
- "inner_j_string": json.dumps(
46
- {"account_name": "*******", "client_id": "*******", "timestamp": 123}
47
- ),
48
- },
49
- }
50
- assert redacted_d == expected_d
51
-
52
-
53
- def test_redact_jsons():
54
- d1 = {
55
- "username": "john_smith",
56
- "password": "13?H%",
57
- "inner": {
58
- "token": "123",
59
- "AWS_KEY": None,
60
- "inner_j_string": json.dumps(
61
- {"account_name": "secret name", "client_id": 123, "timestamp": 123}
62
- ),
63
- },
64
- }
65
-
66
- d2 = {"username": "tim67", "update_time": 456}
67
- d3 = {"account_name": "top secret", "host": "http://localhost:8888"}
68
-
69
- sensitive_string = f"Some topic secret info ({json.dumps(d1)} regarding {d2} and {d3})"
70
- expected_string = (
71
- 'Some topic secret info ({"username": "john_smith", "password": "*******", '
72
- '"inner": {"token": "*******", "AWS_KEY": null, "inner_j_string": '
73
- '"{\\"account_name\\": \\"*******\\", \\"client_id\\": \\"*******\\", '
74
- '\\"timestamp\\": 123}"}} regarding {"username": "tim67", "update_time": 456} '
75
- 'and {"account_name": "*******", "host": "http://localhost:8888"})'
76
- )
77
- redacted_string = redact_jsons(sensitive_string)
78
- assert redacted_string == expected_string