unstructured-ingest 0.7.0__py3-none-any.whl → 0.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/logger.py +2 -93
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/processes/connectors/onedrive.py +45 -17
- unstructured_ingest/processes/connectors/sharepoint.py +1 -1
- unstructured_ingest/utils/string_and_date_utils.py +3 -3
- {unstructured_ingest-0.7.0.dist-info → unstructured_ingest-0.7.2.dist-info}/METADATA +29 -28
- {unstructured_ingest-0.7.0.dist-info → unstructured_ingest-0.7.2.dist-info}/RECORD +13 -12
- test/unit/test_logger.py +0 -78
- {unstructured_ingest-0.7.0.dist-info → unstructured_ingest-0.7.2.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.7.0.dist-info → unstructured_ingest-0.7.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.7.0.dist-info → unstructured_ingest-0.7.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.0.dist-info → unstructured_ingest-0.7.2.dist-info}/top_level.txt +0 -0
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.7.0" # pragma: no cover
|
|
1
|
+
__version__ = "0.7.2" # pragma: no cover
|
unstructured_ingest/logger.py
CHANGED
|
@@ -1,99 +1,8 @@
|
|
|
1
|
-
import ast
|
|
2
|
-
import json
|
|
3
1
|
import logging
|
|
4
|
-
import typing as t
|
|
5
2
|
|
|
6
3
|
logger = logging.getLogger("unstructured_ingest")
|
|
7
4
|
|
|
8
5
|
|
|
9
|
-
def default_is_data_sensitive(k: str, v: t.Any) -> bool:
|
|
10
|
-
sensitive_fields = [
|
|
11
|
-
"account_name",
|
|
12
|
-
"client_id",
|
|
13
|
-
]
|
|
14
|
-
sensitive_triggers = ["key", "cred", "token", "password", "oauth", "secret"]
|
|
15
|
-
return (
|
|
16
|
-
v
|
|
17
|
-
and any([s in k.lower() for s in sensitive_triggers]) # noqa: C419
|
|
18
|
-
or k.lower() in sensitive_fields
|
|
19
|
-
)
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def hide_sensitive_fields(
|
|
23
|
-
data: dict, is_sensitive_fn: t.Callable[[str, t.Any], bool] = default_is_data_sensitive
|
|
24
|
-
) -> dict:
|
|
25
|
-
"""
|
|
26
|
-
Will recursively look through every k, v pair in this dict and any nested ones and run
|
|
27
|
-
is_sensitive_fn to dynamically redact the value of the k, v pair. Will also check if
|
|
28
|
-
any string value can be parsed as valid json and process that dict as well and replace
|
|
29
|
-
the original string with the json.dumps() version of the redacted dict.
|
|
30
|
-
"""
|
|
31
|
-
new_data = data.copy()
|
|
32
|
-
for k, v in new_data.items():
|
|
33
|
-
if is_sensitive_fn(k, v):
|
|
34
|
-
new_data[k] = "*******"
|
|
35
|
-
if isinstance(v, dict):
|
|
36
|
-
new_data[k] = hide_sensitive_fields(v)
|
|
37
|
-
if isinstance(v, str):
|
|
38
|
-
# Need to take into account strings generated via json.dumps() or simply printing a dict
|
|
39
|
-
try:
|
|
40
|
-
json_data = json.loads(v)
|
|
41
|
-
if isinstance(json_data, dict):
|
|
42
|
-
updated_data = hide_sensitive_fields(json_data)
|
|
43
|
-
new_data[k] = json.dumps(updated_data)
|
|
44
|
-
except json.JSONDecodeError:
|
|
45
|
-
pass
|
|
46
|
-
|
|
47
|
-
return new_data
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
def redact_jsons(s: str) -> str:
|
|
51
|
-
"""
|
|
52
|
-
Takes in a generic string and pulls out all valid json content. Leverages
|
|
53
|
-
hide_sensitive_fields() to redact any sensitive information and replaces the
|
|
54
|
-
original json with the new redacted format. There can be any number of valid
|
|
55
|
-
jsons in a generic string and this will work. Having extra '{' without a
|
|
56
|
-
closing '}' will cause this to break though. i.e '{ text, {"a": 3}'.
|
|
57
|
-
|
|
58
|
-
"""
|
|
59
|
-
chars = list(s)
|
|
60
|
-
if "{" not in chars:
|
|
61
|
-
return s
|
|
62
|
-
i = 0
|
|
63
|
-
jsons = []
|
|
64
|
-
i = 0
|
|
65
|
-
while i < len(chars):
|
|
66
|
-
char = chars[i]
|
|
67
|
-
if char == "{":
|
|
68
|
-
stack = [char]
|
|
69
|
-
current = [char]
|
|
70
|
-
while len(stack) != 0 and i < len(chars):
|
|
71
|
-
i += 1
|
|
72
|
-
char = chars[i]
|
|
73
|
-
current.append(char)
|
|
74
|
-
if char == "{":
|
|
75
|
-
stack.append(char)
|
|
76
|
-
if char == "}":
|
|
77
|
-
stack.pop(-1)
|
|
78
|
-
jsons.append("".join(current))
|
|
79
|
-
continue
|
|
80
|
-
i += 1
|
|
81
|
-
for j in jsons:
|
|
82
|
-
try:
|
|
83
|
-
formatted_j = json.dumps(json.loads(j))
|
|
84
|
-
except json.JSONDecodeError:
|
|
85
|
-
formatted_j = json.dumps(ast.literal_eval(j))
|
|
86
|
-
hidden_j = json.dumps(hide_sensitive_fields(json.loads(formatted_j)))
|
|
87
|
-
s = s.replace(j, hidden_j)
|
|
88
|
-
return s
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
class SensitiveFormatter(logging.Formatter):
|
|
92
|
-
def format(self, record):
|
|
93
|
-
s = super().format(record=record)
|
|
94
|
-
return redact_jsons(s)
|
|
95
|
-
|
|
96
|
-
|
|
97
6
|
def remove_root_handlers(logger: logging.Logger) -> None:
|
|
98
7
|
# NOTE(robinson): in some environments such as Google Colab, there is a root handler
|
|
99
8
|
# that doesn't not mask secrets, meaning sensitive info such as api keys appear in logs.
|
|
@@ -106,7 +15,7 @@ def remove_root_handlers(logger: logging.Logger) -> None:
|
|
|
106
15
|
def ingest_log_streaming_init(level: int) -> None:
|
|
107
16
|
handler = logging.StreamHandler()
|
|
108
17
|
handler.name = "ingest_log_handler"
|
|
109
|
-
formatter = SensitiveFormatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
|
|
18
|
+
formatter = logging.Formatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
|
|
110
19
|
handler.setFormatter(formatter)
|
|
111
20
|
|
|
112
21
|
# Only want to add the handler once
|
|
@@ -122,7 +31,7 @@ def make_default_logger(level: int) -> logging.Logger:
|
|
|
122
31
|
logger = logging.getLogger("unstructured_ingest")
|
|
123
32
|
handler = logging.StreamHandler()
|
|
124
33
|
handler.name = "ingest_log_handler"
|
|
125
|
-
formatter = SensitiveFormatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
|
|
34
|
+
formatter = logging.Formatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
|
|
126
35
|
handler.setFormatter(formatter)
|
|
127
36
|
logger.addHandler(handler)
|
|
128
37
|
logger.setLevel(level)
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
{
|
|
2
|
+
"properties": [
|
|
3
|
+
{
|
|
4
|
+
"dataType": [
|
|
5
|
+
"text"
|
|
6
|
+
],
|
|
7
|
+
"indexFilterable": true,
|
|
8
|
+
"indexSearchable": true,
|
|
9
|
+
"name": "record_id",
|
|
10
|
+
"tokenization": "word"
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"dataType": [
|
|
14
|
+
"text"
|
|
15
|
+
],
|
|
16
|
+
"indexFilterable": true,
|
|
17
|
+
"indexSearchable": true,
|
|
18
|
+
"name": "text",
|
|
19
|
+
"tokenization": "word"
|
|
20
|
+
}
|
|
21
|
+
],
|
|
22
|
+
"vectorizer": "none"
|
|
23
|
+
}
|
|
@@ -53,11 +53,14 @@ MAX_BYTES_SIZE = 512_000_000
|
|
|
53
53
|
|
|
54
54
|
class OnedriveAccessConfig(AccessConfig):
|
|
55
55
|
client_cred: str = Field(description="Microsoft App client secret")
|
|
56
|
+
password: Optional[str] = Field(description="Service account password", default=None)
|
|
56
57
|
|
|
57
58
|
|
|
58
59
|
class OnedriveConnectionConfig(ConnectionConfig):
|
|
59
60
|
client_id: str = Field(description="Microsoft app client ID")
|
|
60
|
-
user_pname: str = Field(
|
|
61
|
+
user_pname: str = Field(
|
|
62
|
+
description="User principal name or service account, usually your Azure AD email."
|
|
63
|
+
)
|
|
61
64
|
tenant: str = Field(
|
|
62
65
|
repr=False, description="ID or domain name associated with your Azure AD instance"
|
|
63
66
|
)
|
|
@@ -74,25 +77,50 @@ class OnedriveConnectionConfig(ConnectionConfig):
|
|
|
74
77
|
drive = client.users[self.user_pname].drive
|
|
75
78
|
return drive
|
|
76
79
|
|
|
77
|
-
@requires_dependencies(["msal"], extras="onedrive")
|
|
80
|
+
@requires_dependencies(["msal", "requests"], extras="onedrive")
|
|
78
81
|
def get_token(self):
|
|
79
82
|
from msal import ConfidentialClientApplication
|
|
83
|
+
from requests import post
|
|
84
|
+
|
|
85
|
+
if self.access_config.get_secret_value().password:
|
|
86
|
+
url = f"https://login.microsoftonline.com/{self.tenant}/oauth2/v2.0/token"
|
|
87
|
+
headers = {"Content-Type": "application/x-www-form-urlencoded"}
|
|
88
|
+
data = {
|
|
89
|
+
"grant_type": "password",
|
|
90
|
+
"username": self.user_pname,
|
|
91
|
+
"password": self.access_config.get_secret_value().password,
|
|
92
|
+
"client_id": self.client_id,
|
|
93
|
+
"client_secret": self.access_config.get_secret_value().client_cred,
|
|
94
|
+
"scope": "https://graph.microsoft.com/.default",
|
|
95
|
+
}
|
|
96
|
+
response = post(url, headers=headers, data=data)
|
|
97
|
+
if response.status_code == 200:
|
|
98
|
+
return response.json()
|
|
99
|
+
else:
|
|
100
|
+
raise SourceConnectionError(
|
|
101
|
+
f"Oauth2 authentication failed with {response.status_code}: {response.text}"
|
|
102
|
+
)
|
|
80
103
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
104
|
+
else:
|
|
105
|
+
try:
|
|
106
|
+
app = ConfidentialClientApplication(
|
|
107
|
+
authority=f"{self.authority_url}/{self.tenant}",
|
|
108
|
+
client_id=self.client_id,
|
|
109
|
+
client_credential=self.access_config.get_secret_value().client_cred,
|
|
110
|
+
)
|
|
111
|
+
token = app.acquire_token_for_client(
|
|
112
|
+
scopes=["https://graph.microsoft.com/.default"]
|
|
113
|
+
)
|
|
114
|
+
except ValueError as exc:
|
|
115
|
+
logger.error("Couldn't set up credentials.")
|
|
116
|
+
raise exc
|
|
117
|
+
if "error" in token:
|
|
118
|
+
raise SourceConnectionNetworkError(
|
|
119
|
+
"failed to fetch token, {}: {}".format(
|
|
120
|
+
token["error"], token["error_description"]
|
|
121
|
+
)
|
|
122
|
+
)
|
|
123
|
+
return token
|
|
96
124
|
|
|
97
125
|
@requires_dependencies(["office365"], extras="onedrive")
|
|
98
126
|
def get_client(self) -> "GraphClient":
|
|
@@ -100,7 +100,7 @@ class SharepointDownloader(OnedriveDownloader):
|
|
|
100
100
|
connector_type: str = CONNECTOR_TYPE
|
|
101
101
|
|
|
102
102
|
@SourceConnectionNetworkError.wrap
|
|
103
|
-
@requires_dependencies(["office365"], extras="
|
|
103
|
+
@requires_dependencies(["office365"], extras="sharepoint")
|
|
104
104
|
def _fetch_file(self, file_data: FileData) -> DriveItem:
|
|
105
105
|
from office365.runtime.client_request_exception import ClientRequestException
|
|
106
106
|
|
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import re
|
|
3
|
-
import typing as t
|
|
4
3
|
from datetime import datetime
|
|
4
|
+
from typing import Any, Union
|
|
5
5
|
|
|
6
6
|
from dateutil import parser
|
|
7
7
|
|
|
8
8
|
from unstructured_ingest.logger import logger
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
def json_to_dict(json_string: str) -> t.Union[str, t.Dict[str, t.Any]]:
|
|
11
|
+
def json_to_dict(json_string: str) -> Union[str, dict[str, Any]]:
|
|
12
12
|
"""Helper function attempts to deserialize json string to a dictionary."""
|
|
13
13
|
try:
|
|
14
14
|
return json.loads(json_string)
|
|
@@ -24,7 +24,7 @@ def json_to_dict(json_string: str) -> t.Union[str, t.Dict[str, t.Any]]:
|
|
|
24
24
|
return json_string
|
|
25
25
|
|
|
26
26
|
|
|
27
|
-
def ensure_isoformat_datetime(timestamp: t.Union[datetime, str]) -> str:
|
|
27
|
+
def ensure_isoformat_datetime(timestamp: Union[datetime, str]) -> str:
|
|
28
28
|
"""
|
|
29
29
|
Ensures that the input value is converted to an ISO format datetime string.
|
|
30
30
|
Handles both datetime objects and strings.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: unstructured-ingest
|
|
3
|
-
Version: 0.7.0
|
|
3
|
+
Version: 0.7.2
|
|
4
4
|
Summary: A library that prepares raw documents for downstream ML tasks.
|
|
5
5
|
Home-page: https://github.com/Unstructured-IO/unstructured-ingest
|
|
6
6
|
Author: Unstructured Technologies
|
|
@@ -22,11 +22,11 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
22
22
|
Requires-Python: >=3.9.0,<3.14
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
License-File: LICENSE.md
|
|
25
|
+
Requires-Dist: tqdm
|
|
26
|
+
Requires-Dist: opentelemetry-sdk
|
|
25
27
|
Requires-Dist: click
|
|
26
28
|
Requires-Dist: python-dateutil
|
|
27
29
|
Requires-Dist: pydantic>=2.7
|
|
28
|
-
Requires-Dist: opentelemetry-sdk
|
|
29
|
-
Requires-Dist: tqdm
|
|
30
30
|
Requires-Dist: dataclasses_json
|
|
31
31
|
Requires-Dist: numpy
|
|
32
32
|
Requires-Dist: pandas
|
|
@@ -103,8 +103,8 @@ Requires-Dist: astrapy; extra == "astradb"
|
|
|
103
103
|
Requires-Dist: numpy; extra == "astradb"
|
|
104
104
|
Requires-Dist: pandas; extra == "astradb"
|
|
105
105
|
Provides-Extra: azure
|
|
106
|
-
Requires-Dist: fsspec; extra == "azure"
|
|
107
106
|
Requires-Dist: adlfs; extra == "azure"
|
|
107
|
+
Requires-Dist: fsspec; extra == "azure"
|
|
108
108
|
Requires-Dist: numpy; extra == "azure"
|
|
109
109
|
Requires-Dist: pandas; extra == "azure"
|
|
110
110
|
Provides-Extra: azure-ai-search
|
|
@@ -112,8 +112,8 @@ Requires-Dist: azure-search-documents; extra == "azure-ai-search"
|
|
|
112
112
|
Requires-Dist: numpy; extra == "azure-ai-search"
|
|
113
113
|
Requires-Dist: pandas; extra == "azure-ai-search"
|
|
114
114
|
Provides-Extra: biomed
|
|
115
|
-
Requires-Dist: bs4; extra == "biomed"
|
|
116
115
|
Requires-Dist: requests; extra == "biomed"
|
|
116
|
+
Requires-Dist: bs4; extra == "biomed"
|
|
117
117
|
Requires-Dist: numpy; extra == "biomed"
|
|
118
118
|
Requires-Dist: pandas; extra == "biomed"
|
|
119
119
|
Provides-Extra: box
|
|
@@ -139,8 +139,8 @@ Requires-Dist: couchbase; extra == "couchbase"
|
|
|
139
139
|
Requires-Dist: numpy; extra == "couchbase"
|
|
140
140
|
Requires-Dist: pandas; extra == "couchbase"
|
|
141
141
|
Provides-Extra: delta-table
|
|
142
|
-
Requires-Dist: boto3; extra == "delta-table"
|
|
143
142
|
Requires-Dist: deltalake; extra == "delta-table"
|
|
143
|
+
Requires-Dist: boto3; extra == "delta-table"
|
|
144
144
|
Requires-Dist: numpy; extra == "delta-table"
|
|
145
145
|
Requires-Dist: pandas; extra == "delta-table"
|
|
146
146
|
Provides-Extra: discord
|
|
@@ -148,8 +148,8 @@ Requires-Dist: discord.py; extra == "discord"
|
|
|
148
148
|
Requires-Dist: numpy; extra == "discord"
|
|
149
149
|
Requires-Dist: pandas; extra == "discord"
|
|
150
150
|
Provides-Extra: dropbox
|
|
151
|
-
Requires-Dist: fsspec; extra == "dropbox"
|
|
152
151
|
Requires-Dist: dropboxdrivefs; extra == "dropbox"
|
|
152
|
+
Requires-Dist: fsspec; extra == "dropbox"
|
|
153
153
|
Requires-Dist: numpy; extra == "dropbox"
|
|
154
154
|
Requires-Dist: pandas; extra == "dropbox"
|
|
155
155
|
Provides-Extra: duckdb
|
|
@@ -161,14 +161,14 @@ Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
|
|
|
161
161
|
Requires-Dist: numpy; extra == "elasticsearch"
|
|
162
162
|
Requires-Dist: pandas; extra == "elasticsearch"
|
|
163
163
|
Provides-Extra: gcs
|
|
164
|
-
Requires-Dist: fsspec; extra == "gcs"
|
|
165
|
-
Requires-Dist: bs4; extra == "gcs"
|
|
166
164
|
Requires-Dist: gcsfs; extra == "gcs"
|
|
165
|
+
Requires-Dist: bs4; extra == "gcs"
|
|
166
|
+
Requires-Dist: fsspec; extra == "gcs"
|
|
167
167
|
Requires-Dist: numpy; extra == "gcs"
|
|
168
168
|
Requires-Dist: pandas; extra == "gcs"
|
|
169
169
|
Provides-Extra: github
|
|
170
|
-
Requires-Dist: pygithub>1.58.0; extra == "github"
|
|
171
170
|
Requires-Dist: requests; extra == "github"
|
|
171
|
+
Requires-Dist: pygithub>1.58.0; extra == "github"
|
|
172
172
|
Requires-Dist: numpy; extra == "github"
|
|
173
173
|
Requires-Dist: pandas; extra == "github"
|
|
174
174
|
Provides-Extra: gitlab
|
|
@@ -180,15 +180,15 @@ Requires-Dist: google-api-python-client; extra == "google-drive"
|
|
|
180
180
|
Requires-Dist: numpy; extra == "google-drive"
|
|
181
181
|
Requires-Dist: pandas; extra == "google-drive"
|
|
182
182
|
Provides-Extra: hubspot
|
|
183
|
-
Requires-Dist: hubspot-api-client; extra == "hubspot"
|
|
184
183
|
Requires-Dist: urllib3; extra == "hubspot"
|
|
184
|
+
Requires-Dist: hubspot-api-client; extra == "hubspot"
|
|
185
185
|
Requires-Dist: numpy; extra == "hubspot"
|
|
186
186
|
Requires-Dist: pandas; extra == "hubspot"
|
|
187
187
|
Provides-Extra: ibm-watsonx-s3
|
|
188
|
-
Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
|
|
189
|
-
Requires-Dist: httpx; extra == "ibm-watsonx-s3"
|
|
190
188
|
Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
|
|
189
|
+
Requires-Dist: httpx; extra == "ibm-watsonx-s3"
|
|
191
190
|
Requires-Dist: pyiceberg; extra == "ibm-watsonx-s3"
|
|
191
|
+
Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
|
|
192
192
|
Requires-Dist: numpy; extra == "ibm-watsonx-s3"
|
|
193
193
|
Requires-Dist: pandas; extra == "ibm-watsonx-s3"
|
|
194
194
|
Provides-Extra: jira
|
|
@@ -216,22 +216,22 @@ Requires-Dist: pymongo; extra == "mongodb"
|
|
|
216
216
|
Requires-Dist: numpy; extra == "mongodb"
|
|
217
217
|
Requires-Dist: pandas; extra == "mongodb"
|
|
218
218
|
Provides-Extra: neo4j
|
|
219
|
+
Requires-Dist: neo4j-rust-ext; extra == "neo4j"
|
|
219
220
|
Requires-Dist: networkx; extra == "neo4j"
|
|
220
221
|
Requires-Dist: cymple; extra == "neo4j"
|
|
221
|
-
Requires-Dist: neo4j-rust-ext; extra == "neo4j"
|
|
222
222
|
Requires-Dist: numpy; extra == "neo4j"
|
|
223
223
|
Requires-Dist: pandas; extra == "neo4j"
|
|
224
224
|
Provides-Extra: notion
|
|
225
|
-
Requires-Dist: htmlBuilder; extra == "notion"
|
|
226
|
-
Requires-Dist: httpx; extra == "notion"
|
|
227
225
|
Requires-Dist: notion-client; extra == "notion"
|
|
226
|
+
Requires-Dist: httpx; extra == "notion"
|
|
228
227
|
Requires-Dist: backoff; extra == "notion"
|
|
228
|
+
Requires-Dist: htmlBuilder; extra == "notion"
|
|
229
229
|
Requires-Dist: numpy; extra == "notion"
|
|
230
230
|
Requires-Dist: pandas; extra == "notion"
|
|
231
231
|
Provides-Extra: onedrive
|
|
232
|
-
Requires-Dist:
|
|
233
|
-
Requires-Dist: msal; extra == "onedrive"
|
|
232
|
+
Requires-Dist: requests; extra == "onedrive"
|
|
234
233
|
Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
|
|
234
|
+
Requires-Dist: msal; extra == "onedrive"
|
|
235
235
|
Requires-Dist: numpy; extra == "onedrive"
|
|
236
236
|
Requires-Dist: pandas; extra == "onedrive"
|
|
237
237
|
Provides-Extra: opensearch
|
|
@@ -239,8 +239,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
|
|
|
239
239
|
Requires-Dist: numpy; extra == "opensearch"
|
|
240
240
|
Requires-Dist: pandas; extra == "opensearch"
|
|
241
241
|
Provides-Extra: outlook
|
|
242
|
-
Requires-Dist: msal; extra == "outlook"
|
|
243
242
|
Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
|
|
243
|
+
Requires-Dist: msal; extra == "outlook"
|
|
244
244
|
Requires-Dist: numpy; extra == "outlook"
|
|
245
245
|
Requires-Dist: pandas; extra == "outlook"
|
|
246
246
|
Provides-Extra: pinecone
|
|
@@ -264,13 +264,14 @@ Requires-Dist: redis; extra == "redis"
|
|
|
264
264
|
Requires-Dist: numpy; extra == "redis"
|
|
265
265
|
Requires-Dist: pandas; extra == "redis"
|
|
266
266
|
Provides-Extra: s3
|
|
267
|
-
Requires-Dist: fsspec; extra == "s3"
|
|
268
267
|
Requires-Dist: s3fs; extra == "s3"
|
|
268
|
+
Requires-Dist: fsspec; extra == "s3"
|
|
269
269
|
Requires-Dist: numpy; extra == "s3"
|
|
270
270
|
Requires-Dist: pandas; extra == "s3"
|
|
271
271
|
Provides-Extra: sharepoint
|
|
272
|
-
Requires-Dist:
|
|
272
|
+
Requires-Dist: requests; extra == "sharepoint"
|
|
273
273
|
Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
|
|
274
|
+
Requires-Dist: msal; extra == "sharepoint"
|
|
274
275
|
Requires-Dist: numpy; extra == "sharepoint"
|
|
275
276
|
Requires-Dist: pandas; extra == "sharepoint"
|
|
276
277
|
Provides-Extra: salesforce
|
|
@@ -278,8 +279,8 @@ Requires-Dist: simple-salesforce; extra == "salesforce"
|
|
|
278
279
|
Requires-Dist: numpy; extra == "salesforce"
|
|
279
280
|
Requires-Dist: pandas; extra == "salesforce"
|
|
280
281
|
Provides-Extra: sftp
|
|
281
|
-
Requires-Dist: fsspec; extra == "sftp"
|
|
282
282
|
Requires-Dist: paramiko; extra == "sftp"
|
|
283
|
+
Requires-Dist: fsspec; extra == "sftp"
|
|
283
284
|
Requires-Dist: numpy; extra == "sftp"
|
|
284
285
|
Requires-Dist: pandas; extra == "sftp"
|
|
285
286
|
Provides-Extra: slack
|
|
@@ -312,21 +313,21 @@ Requires-Dist: singlestoredb; extra == "singlestore"
|
|
|
312
313
|
Requires-Dist: numpy; extra == "singlestore"
|
|
313
314
|
Requires-Dist: pandas; extra == "singlestore"
|
|
314
315
|
Provides-Extra: vectara
|
|
315
|
-
Requires-Dist: httpx; extra == "vectara"
|
|
316
316
|
Requires-Dist: requests; extra == "vectara"
|
|
317
|
+
Requires-Dist: httpx; extra == "vectara"
|
|
317
318
|
Requires-Dist: aiofiles; extra == "vectara"
|
|
318
319
|
Requires-Dist: numpy; extra == "vectara"
|
|
319
320
|
Requires-Dist: pandas; extra == "vectara"
|
|
320
321
|
Provides-Extra: vastdb
|
|
321
322
|
Requires-Dist: ibis; extra == "vastdb"
|
|
322
|
-
Requires-Dist: pyarrow; extra == "vastdb"
|
|
323
323
|
Requires-Dist: vastdb; extra == "vastdb"
|
|
324
|
+
Requires-Dist: pyarrow; extra == "vastdb"
|
|
324
325
|
Requires-Dist: numpy; extra == "vastdb"
|
|
325
326
|
Requires-Dist: pandas; extra == "vastdb"
|
|
326
327
|
Provides-Extra: zendesk
|
|
328
|
+
Requires-Dist: aiofiles; extra == "zendesk"
|
|
327
329
|
Requires-Dist: bs4; extra == "zendesk"
|
|
328
330
|
Requires-Dist: httpx; extra == "zendesk"
|
|
329
|
-
Requires-Dist: aiofiles; extra == "zendesk"
|
|
330
331
|
Requires-Dist: numpy; extra == "zendesk"
|
|
331
332
|
Requires-Dist: pandas; extra == "zendesk"
|
|
332
333
|
Provides-Extra: embed-huggingface
|
|
@@ -334,8 +335,8 @@ Requires-Dist: sentence-transformers; extra == "embed-huggingface"
|
|
|
334
335
|
Requires-Dist: numpy; extra == "embed-huggingface"
|
|
335
336
|
Requires-Dist: pandas; extra == "embed-huggingface"
|
|
336
337
|
Provides-Extra: embed-octoai
|
|
337
|
-
Requires-Dist: tiktoken; extra == "embed-octoai"
|
|
338
338
|
Requires-Dist: openai; extra == "embed-octoai"
|
|
339
|
+
Requires-Dist: tiktoken; extra == "embed-octoai"
|
|
339
340
|
Requires-Dist: numpy; extra == "embed-octoai"
|
|
340
341
|
Requires-Dist: pandas; extra == "embed-octoai"
|
|
341
342
|
Provides-Extra: embed-vertexai
|
|
@@ -351,13 +352,13 @@ Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
|
|
|
351
352
|
Requires-Dist: numpy; extra == "embed-mixedbreadai"
|
|
352
353
|
Requires-Dist: pandas; extra == "embed-mixedbreadai"
|
|
353
354
|
Provides-Extra: openai
|
|
354
|
-
Requires-Dist: tiktoken; extra == "openai"
|
|
355
355
|
Requires-Dist: openai; extra == "openai"
|
|
356
|
+
Requires-Dist: tiktoken; extra == "openai"
|
|
356
357
|
Requires-Dist: numpy; extra == "openai"
|
|
357
358
|
Requires-Dist: pandas; extra == "openai"
|
|
358
359
|
Provides-Extra: bedrock
|
|
359
|
-
Requires-Dist: boto3; extra == "bedrock"
|
|
360
360
|
Requires-Dist: aioboto3; extra == "bedrock"
|
|
361
|
+
Requires-Dist: boto3; extra == "bedrock"
|
|
361
362
|
Requires-Dist: numpy; extra == "bedrock"
|
|
362
363
|
Requires-Dist: pandas; extra == "bedrock"
|
|
363
364
|
Provides-Extra: togetherai
|
|
@@ -101,7 +101,6 @@ test/unit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
|
101
101
|
test/unit/test_error.py,sha256=RflmngCdFNKOLXVfLnUdNfY3Mfg3k7DTEzfIl0B-syU,840
|
|
102
102
|
test/unit/test_html.py,sha256=ubsck9pVOnPDFL0P8TZkko_46MIaFLlSNQcsgFDgYoE,4496
|
|
103
103
|
test/unit/test_interfaces.py,sha256=Gv3WMJsw_3xPLy3nI3dIcJuLa2WvKYszSjI_W9XLtVM,787
|
|
104
|
-
test/unit/test_logger.py,sha256=0SKndXE_VRd8XmUHkrj7zuBQHZscXx3ZQllMEOvtF9Y,2380
|
|
105
104
|
test/unit/test_utils.py,sha256=xeSM02zOChSOO3dzDOVAEiQme1rQ8drjnJF93S3BFmk,7247
|
|
106
105
|
test/unit/chunkers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
107
106
|
test/unit/chunkers/test_chunkers.py,sha256=wRxbSj7P1FwRGDyVcARkm8CQSVCBCro3nTe54UoUBzc,1769
|
|
@@ -134,10 +133,10 @@ test/unit/partitioners/test_partitioner.py,sha256=eJoUDbiKtweyU1WYfsY5KqVqoPjbx1
|
|
|
134
133
|
test/unit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
135
134
|
test/unit/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
|
|
136
135
|
unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
137
|
-
unstructured_ingest/__version__.py,sha256=
|
|
136
|
+
unstructured_ingest/__version__.py,sha256=7O8GlC09PP-XuUDOj6bhRUtbOuUgpBT2COw4AjU1kk0,42
|
|
138
137
|
unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
|
|
139
138
|
unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
|
|
140
|
-
unstructured_ingest/logger.py,sha256=
|
|
139
|
+
unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
|
|
141
140
|
unstructured_ingest/main.py,sha256=82G_7eG4PNhc_xIqj4Y_sFbDV9VI-nwSfsfJQMzovMk,169
|
|
142
141
|
unstructured_ingest/otel.py,sha256=NsUqOolA0gt69eFhZLABjVpcKoM9aus-AbxIKqWqPTc,4127
|
|
143
142
|
unstructured_ingest/unstructured_api.py,sha256=hWUXUhGtyfi2OcDR-BriHJyT4jJywf4zfG1qpSCf9Bo,5002
|
|
@@ -212,16 +211,18 @@ unstructured_ingest/processes/connectors/local.py,sha256=LluTLKv4g7FbJb4A6vuSxI9
|
|
|
212
211
|
unstructured_ingest/processes/connectors/milvus.py,sha256=Jr9cul7By03tGAPFnFBoqncnNWwbhKd-qbmkuqnin8U,8908
|
|
213
212
|
unstructured_ingest/processes/connectors/mongodb.py,sha256=1g_5bfbS6lah3nsOXqLAanR3zNYJ47_Njw_uV-uj3_U,14324
|
|
214
213
|
unstructured_ingest/processes/connectors/neo4j.py,sha256=eAM2XWSLA5caKJmbcd7ctn2TapreIJEXRoHoxT1OZwA,18718
|
|
215
|
-
unstructured_ingest/processes/connectors/onedrive.py,sha256=
|
|
214
|
+
unstructured_ingest/processes/connectors/onedrive.py,sha256=VBkKlbJgR7uKlKTnjNybAw6ZawLKflDPpy2uVvgWYWw,19296
|
|
216
215
|
unstructured_ingest/processes/connectors/outlook.py,sha256=FfHV9OfajGbj5VQZccqHsSyYJ0f6a4CLGQJi1s9UJjo,9294
|
|
217
216
|
unstructured_ingest/processes/connectors/pinecone.py,sha256=TG-1hVfOsKFepxPfy2MCwEVBEZF4msg8lfNQZBpo35Y,13980
|
|
218
217
|
unstructured_ingest/processes/connectors/redisdb.py,sha256=5LX6KtuNCzqjHqnJPw0zdKLE0iLx7Dk5RN9e_KT-up4,6975
|
|
219
218
|
unstructured_ingest/processes/connectors/salesforce.py,sha256=a2Erx5pXbxKIj--oJWTGk2TeOcdmipuxgleazbD62o4,11664
|
|
220
|
-
unstructured_ingest/processes/connectors/sharepoint.py,sha256=
|
|
219
|
+
unstructured_ingest/processes/connectors/sharepoint.py,sha256=PowaqMzWr-VCW1rnwcAeRhHyE55kJ9J9FCVlrmtzN0E,4827
|
|
221
220
|
unstructured_ingest/processes/connectors/slack.py,sha256=e4ntATdht_olAPsco1DKwlrOkpKLyDznPO1NJmsr0A8,9243
|
|
222
221
|
unstructured_ingest/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
|
|
223
222
|
unstructured_ingest/processes/connectors/vectara.py,sha256=frKJkc7ffstQhXD9-HkAGoQAofGkl6AsnKJhGcl8LgA,12294
|
|
224
223
|
unstructured_ingest/processes/connectors/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
224
|
+
unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql,sha256=dUZZDNkyvQXKqoAThRz3ek7zaUE2l_LAQimlG5WZhH4,211
|
|
225
|
+
unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json,sha256=SJlIO0kXxy866tWQ8bEzvwLwflsoUMIS-OKlxMvHIuE,504
|
|
225
226
|
unstructured_ingest/processes/connectors/databricks/__init__.py,sha256=RtKAPyNtXh6fzEsOQ08pA0-vC1uMr3KqYG6cqiBoo70,2133
|
|
226
227
|
unstructured_ingest/processes/connectors/databricks/volumes.py,sha256=OWQrne9-5hPzc-kxGa2P53M3DoksDzMDyjLhQyihdCo,8020
|
|
227
228
|
unstructured_ingest/processes/connectors/databricks/volumes_aws.py,sha256=RP9rq2sfysygiqzXj6eX0CXeZpxk65xmrz7HZnWRQWA,2961
|
|
@@ -360,11 +361,11 @@ unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWh
|
|
|
360
361
|
unstructured_ingest/utils/html.py,sha256=0WduP8tI5S3nHFQi6XHNPHgsIC9j3iWwyIayX9gDLiE,6386
|
|
361
362
|
unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01qAbElH0,1201
|
|
362
363
|
unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
|
|
363
|
-
unstructured_ingest/utils/string_and_date_utils.py,sha256=
|
|
364
|
+
unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
|
|
364
365
|
unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
|
|
365
|
-
unstructured_ingest-0.7.
|
|
366
|
-
unstructured_ingest-0.7.
|
|
367
|
-
unstructured_ingest-0.7.
|
|
368
|
-
unstructured_ingest-0.7.
|
|
369
|
-
unstructured_ingest-0.7.
|
|
370
|
-
unstructured_ingest-0.7.
|
|
366
|
+
unstructured_ingest-0.7.2.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
|
|
367
|
+
unstructured_ingest-0.7.2.dist-info/METADATA,sha256=BjJRt_WKMPbiOWOxGZPs3Q9ZmwHRkPfF0FbWT7X7lA4,15050
|
|
368
|
+
unstructured_ingest-0.7.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
369
|
+
unstructured_ingest-0.7.2.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
370
|
+
unstructured_ingest-0.7.2.dist-info/top_level.txt,sha256=85vUyT6fV2A5eCEM3M3FPRUUI9vZOVK1xVZt7eo1oV8,34
|
|
371
|
+
unstructured_ingest-0.7.2.dist-info/RECORD,,
|
test/unit/test_logger.py
DELETED
|
@@ -1,78 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
|
|
3
|
-
import pytest
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.logger import (
|
|
6
|
-
default_is_data_sensitive,
|
|
7
|
-
hide_sensitive_fields,
|
|
8
|
-
redact_jsons,
|
|
9
|
-
)
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
@pytest.mark.parametrize(
|
|
13
|
-
("key", "value", "is_sensitive"),
|
|
14
|
-
[
|
|
15
|
-
("username", "john_smith", False),
|
|
16
|
-
("password", "13?H%", True),
|
|
17
|
-
("token", "123", True),
|
|
18
|
-
("AWS_CREDENTIAL", "aws_credential", True),
|
|
19
|
-
("AWS_KEY", None, False),
|
|
20
|
-
],
|
|
21
|
-
)
|
|
22
|
-
def test_default_is_sensitive(key, value, is_sensitive):
|
|
23
|
-
assert default_is_data_sensitive(key, value) == is_sensitive
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
def test_hide_sensitive_fields():
|
|
27
|
-
d = {
|
|
28
|
-
"username": "john_smith",
|
|
29
|
-
"password": "13?H%",
|
|
30
|
-
"inner": {
|
|
31
|
-
"token": "123",
|
|
32
|
-
"AWS_KEY": None,
|
|
33
|
-
"inner_j_string": json.dumps(
|
|
34
|
-
{"account_name": "secret name", "client_id": 123, "timestamp": 123}
|
|
35
|
-
),
|
|
36
|
-
},
|
|
37
|
-
}
|
|
38
|
-
redacted_d = hide_sensitive_fields(d)
|
|
39
|
-
expected_d = {
|
|
40
|
-
"password": "*******",
|
|
41
|
-
"username": "john_smith",
|
|
42
|
-
"inner": {
|
|
43
|
-
"token": "*******",
|
|
44
|
-
"AWS_KEY": None,
|
|
45
|
-
"inner_j_string": json.dumps(
|
|
46
|
-
{"account_name": "*******", "client_id": "*******", "timestamp": 123}
|
|
47
|
-
),
|
|
48
|
-
},
|
|
49
|
-
}
|
|
50
|
-
assert redacted_d == expected_d
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
def test_redact_jsons():
|
|
54
|
-
d1 = {
|
|
55
|
-
"username": "john_smith",
|
|
56
|
-
"password": "13?H%",
|
|
57
|
-
"inner": {
|
|
58
|
-
"token": "123",
|
|
59
|
-
"AWS_KEY": None,
|
|
60
|
-
"inner_j_string": json.dumps(
|
|
61
|
-
{"account_name": "secret name", "client_id": 123, "timestamp": 123}
|
|
62
|
-
),
|
|
63
|
-
},
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
d2 = {"username": "tim67", "update_time": 456}
|
|
67
|
-
d3 = {"account_name": "top secret", "host": "http://localhost:8888"}
|
|
68
|
-
|
|
69
|
-
sensitive_string = f"Some topic secret info ({json.dumps(d1)} regarding {d2} and {d3})"
|
|
70
|
-
expected_string = (
|
|
71
|
-
'Some topic secret info ({"username": "john_smith", "password": "*******", '
|
|
72
|
-
'"inner": {"token": "*******", "AWS_KEY": null, "inner_j_string": '
|
|
73
|
-
'"{\\"account_name\\": \\"*******\\", \\"client_id\\": \\"*******\\", '
|
|
74
|
-
'\\"timestamp\\": 123}"}} regarding {"username": "tim67", "update_time": 456} '
|
|
75
|
-
'and {"account_name": "*******", "host": "http://localhost:8888"})'
|
|
76
|
-
)
|
|
77
|
-
redacted_string = redact_jsons(sensitive_string)
|
|
78
|
-
assert redacted_string == expected_string
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-0.7.0.dist-info → unstructured_ingest-0.7.2.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|