unstructured-ingest 0.7.1__py3-none-any.whl → 0.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/logger.py +2 -93
- unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
- unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
- unstructured_ingest/utils/string_and_date_utils.py +3 -3
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-0.7.2.dist-info}/METADATA +99 -99
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-0.7.2.dist-info}/RECORD +11 -10
- test/unit/test_logger.py +0 -78
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-0.7.2.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-0.7.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-0.7.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-0.7.2.dist-info}/top_level.txt +0 -0
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.7.1" # pragma: no cover
|
|
1
|
+
__version__ = "0.7.2" # pragma: no cover
|
unstructured_ingest/logger.py
CHANGED
|
@@ -1,99 +1,8 @@
|
|
|
1
|
-
import ast
|
|
2
|
-
import json
|
|
3
1
|
import logging
|
|
4
|
-
import typing as t
|
|
5
2
|
|
|
6
3
|
logger = logging.getLogger("unstructured_ingest")
|
|
7
4
|
|
|
8
5
|
|
|
9
|
-
def default_is_data_sensitive(k: str, v: t.Any) -> bool:
|
|
10
|
-
sensitive_fields = [
|
|
11
|
-
"account_name",
|
|
12
|
-
"client_id",
|
|
13
|
-
]
|
|
14
|
-
sensitive_triggers = ["key", "cred", "token", "password", "oauth", "secret"]
|
|
15
|
-
return (
|
|
16
|
-
v
|
|
17
|
-
and any([s in k.lower() for s in sensitive_triggers]) # noqa: C419
|
|
18
|
-
or k.lower() in sensitive_fields
|
|
19
|
-
)
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def hide_sensitive_fields(
|
|
23
|
-
data: dict, is_sensitive_fn: t.Callable[[str, t.Any], bool] = default_is_data_sensitive
|
|
24
|
-
) -> dict:
|
|
25
|
-
"""
|
|
26
|
-
Will recursively look through every k, v pair in this dict and any nested ones and run
|
|
27
|
-
is_sensitive_fn to dynamically redact the value of the k, v pair. Will also check if
|
|
28
|
-
any string value can be parsed as valid json and process that dict as well and replace
|
|
29
|
-
the original string with the json.dumps() version of the redacted dict.
|
|
30
|
-
"""
|
|
31
|
-
new_data = data.copy()
|
|
32
|
-
for k, v in new_data.items():
|
|
33
|
-
if is_sensitive_fn(k, v):
|
|
34
|
-
new_data[k] = "*******"
|
|
35
|
-
if isinstance(v, dict):
|
|
36
|
-
new_data[k] = hide_sensitive_fields(v)
|
|
37
|
-
if isinstance(v, str):
|
|
38
|
-
# Need to take into account strings generated via json.dumps() or simply printing a dict
|
|
39
|
-
try:
|
|
40
|
-
json_data = json.loads(v)
|
|
41
|
-
if isinstance(json_data, dict):
|
|
42
|
-
updated_data = hide_sensitive_fields(json_data)
|
|
43
|
-
new_data[k] = json.dumps(updated_data)
|
|
44
|
-
except json.JSONDecodeError:
|
|
45
|
-
pass
|
|
46
|
-
|
|
47
|
-
return new_data
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
def redact_jsons(s: str) -> str:
|
|
51
|
-
"""
|
|
52
|
-
Takes in a generic string and pulls out all valid json content. Leverages
|
|
53
|
-
hide_sensitive_fields() to redact any sensitive information and replaces the
|
|
54
|
-
original json with the new redacted format. There can be any number of valid
|
|
55
|
-
jsons in a generic string and this will work. Having extra '{' without a
|
|
56
|
-
closing '}' will cause this to break though. i.e '{ text, {"a": 3}'.
|
|
57
|
-
|
|
58
|
-
"""
|
|
59
|
-
chars = list(s)
|
|
60
|
-
if "{" not in chars:
|
|
61
|
-
return s
|
|
62
|
-
i = 0
|
|
63
|
-
jsons = []
|
|
64
|
-
i = 0
|
|
65
|
-
while i < len(chars):
|
|
66
|
-
char = chars[i]
|
|
67
|
-
if char == "{":
|
|
68
|
-
stack = [char]
|
|
69
|
-
current = [char]
|
|
70
|
-
while len(stack) != 0 and i < len(chars):
|
|
71
|
-
i += 1
|
|
72
|
-
char = chars[i]
|
|
73
|
-
current.append(char)
|
|
74
|
-
if char == "{":
|
|
75
|
-
stack.append(char)
|
|
76
|
-
if char == "}":
|
|
77
|
-
stack.pop(-1)
|
|
78
|
-
jsons.append("".join(current))
|
|
79
|
-
continue
|
|
80
|
-
i += 1
|
|
81
|
-
for j in jsons:
|
|
82
|
-
try:
|
|
83
|
-
formatted_j = json.dumps(json.loads(j))
|
|
84
|
-
except json.JSONDecodeError:
|
|
85
|
-
formatted_j = json.dumps(ast.literal_eval(j))
|
|
86
|
-
hidden_j = json.dumps(hide_sensitive_fields(json.loads(formatted_j)))
|
|
87
|
-
s = s.replace(j, hidden_j)
|
|
88
|
-
return s
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
class SensitiveFormatter(logging.Formatter):
|
|
92
|
-
def format(self, record):
|
|
93
|
-
s = super().format(record=record)
|
|
94
|
-
return redact_jsons(s)
|
|
95
|
-
|
|
96
|
-
|
|
97
6
|
def remove_root_handlers(logger: logging.Logger) -> None:
|
|
98
7
|
# NOTE(robinson): in some environments such as Google Colab, there is a root handler
|
|
99
8
|
# that doesn't not mask secrets, meaning sensitive info such as api keys appear in logs.
|
|
@@ -106,7 +15,7 @@ def remove_root_handlers(logger: logging.Logger) -> None:
|
|
|
106
15
|
def ingest_log_streaming_init(level: int) -> None:
|
|
107
16
|
handler = logging.StreamHandler()
|
|
108
17
|
handler.name = "ingest_log_handler"
|
|
109
|
-
formatter = SensitiveFormatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
|
|
18
|
+
formatter = logging.Formatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
|
|
110
19
|
handler.setFormatter(formatter)
|
|
111
20
|
|
|
112
21
|
# Only want to add the handler once
|
|
@@ -122,7 +31,7 @@ def make_default_logger(level: int) -> logging.Logger:
|
|
|
122
31
|
logger = logging.getLogger("unstructured_ingest")
|
|
123
32
|
handler = logging.StreamHandler()
|
|
124
33
|
handler.name = "ingest_log_handler"
|
|
125
|
-
formatter = SensitiveFormatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
|
|
34
|
+
formatter = logging.Formatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
|
|
126
35
|
handler.setFormatter(formatter)
|
|
127
36
|
logger.addHandler(handler)
|
|
128
37
|
logger.setLevel(level)
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
{
|
|
2
|
+
"properties": [
|
|
3
|
+
{
|
|
4
|
+
"dataType": [
|
|
5
|
+
"text"
|
|
6
|
+
],
|
|
7
|
+
"indexFilterable": true,
|
|
8
|
+
"indexSearchable": true,
|
|
9
|
+
"name": "record_id",
|
|
10
|
+
"tokenization": "word"
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"dataType": [
|
|
14
|
+
"text"
|
|
15
|
+
],
|
|
16
|
+
"indexFilterable": true,
|
|
17
|
+
"indexSearchable": true,
|
|
18
|
+
"name": "text",
|
|
19
|
+
"tokenization": "word"
|
|
20
|
+
}
|
|
21
|
+
],
|
|
22
|
+
"vectorizer": "none"
|
|
23
|
+
}
|
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import re
|
|
3
|
-
import typing as t
|
|
4
3
|
from datetime import datetime
|
|
4
|
+
from typing import Any, Union
|
|
5
5
|
|
|
6
6
|
from dateutil import parser
|
|
7
7
|
|
|
8
8
|
from unstructured_ingest.logger import logger
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
def json_to_dict(json_string: str) -> t.Union[str, t.Dict[str, t.Any]]:
|
|
11
|
+
def json_to_dict(json_string: str) -> Union[str, dict[str, Any]]:
|
|
12
12
|
"""Helper function attempts to deserialize json string to a dictionary."""
|
|
13
13
|
try:
|
|
14
14
|
return json.loads(json_string)
|
|
@@ -24,7 +24,7 @@ def json_to_dict(json_string: str) -> t.Union[str, t.Dict[str, t.Any]]:
|
|
|
24
24
|
return json_string
|
|
25
25
|
|
|
26
26
|
|
|
27
|
-
def ensure_isoformat_datetime(timestamp: t.Union[datetime, str]) -> str:
|
|
27
|
+
def ensure_isoformat_datetime(timestamp: Union[datetime, str]) -> str:
|
|
28
28
|
"""
|
|
29
29
|
Ensures that the input value is converted to an ISO format datetime string.
|
|
30
30
|
Handles both datetime objects and strings.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: unstructured-ingest
|
|
3
|
-
Version: 0.7.1
|
|
3
|
+
Version: 0.7.2
|
|
4
4
|
Summary: A library that prepares raw documents for downstream ML tasks.
|
|
5
5
|
Home-page: https://github.com/Unstructured-IO/unstructured-ingest
|
|
6
6
|
Author: Unstructured Technologies
|
|
@@ -22,349 +22,349 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
22
22
|
Requires-Python: >=3.9.0,<3.14
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
License-File: LICENSE.md
|
|
25
|
+
Requires-Dist: tqdm
|
|
25
26
|
Requires-Dist: opentelemetry-sdk
|
|
26
|
-
Requires-Dist: dataclasses_json
|
|
27
27
|
Requires-Dist: click
|
|
28
|
-
Requires-Dist: tqdm
|
|
29
|
-
Requires-Dist: pydantic>=2.7
|
|
30
28
|
Requires-Dist: python-dateutil
|
|
31
|
-
Requires-Dist: pandas
|
|
29
|
+
Requires-Dist: pydantic>=2.7
|
|
30
|
+
Requires-Dist: dataclasses_json
|
|
32
31
|
Requires-Dist: numpy
|
|
32
|
+
Requires-Dist: pandas
|
|
33
33
|
Provides-Extra: remote
|
|
34
34
|
Requires-Dist: unstructured-client>=0.30.0; extra == "remote"
|
|
35
|
-
Requires-Dist: pandas; extra == "remote"
|
|
36
35
|
Requires-Dist: numpy; extra == "remote"
|
|
36
|
+
Requires-Dist: pandas; extra == "remote"
|
|
37
37
|
Provides-Extra: csv
|
|
38
38
|
Requires-Dist: unstructured[tsv]; extra == "csv"
|
|
39
|
-
Requires-Dist: pandas; extra == "csv"
|
|
40
39
|
Requires-Dist: numpy; extra == "csv"
|
|
40
|
+
Requires-Dist: pandas; extra == "csv"
|
|
41
41
|
Provides-Extra: doc
|
|
42
42
|
Requires-Dist: unstructured[docx]; extra == "doc"
|
|
43
|
-
Requires-Dist: pandas; extra == "doc"
|
|
44
43
|
Requires-Dist: numpy; extra == "doc"
|
|
44
|
+
Requires-Dist: pandas; extra == "doc"
|
|
45
45
|
Provides-Extra: docx
|
|
46
46
|
Requires-Dist: unstructured[docx]; extra == "docx"
|
|
47
|
-
Requires-Dist: pandas; extra == "docx"
|
|
48
47
|
Requires-Dist: numpy; extra == "docx"
|
|
48
|
+
Requires-Dist: pandas; extra == "docx"
|
|
49
49
|
Provides-Extra: epub
|
|
50
50
|
Requires-Dist: unstructured[epub]; extra == "epub"
|
|
51
|
-
Requires-Dist: pandas; extra == "epub"
|
|
52
51
|
Requires-Dist: numpy; extra == "epub"
|
|
52
|
+
Requires-Dist: pandas; extra == "epub"
|
|
53
53
|
Provides-Extra: md
|
|
54
54
|
Requires-Dist: unstructured[md]; extra == "md"
|
|
55
|
-
Requires-Dist: pandas; extra == "md"
|
|
56
55
|
Requires-Dist: numpy; extra == "md"
|
|
56
|
+
Requires-Dist: pandas; extra == "md"
|
|
57
57
|
Provides-Extra: msg
|
|
58
58
|
Requires-Dist: unstructured[msg]; extra == "msg"
|
|
59
|
-
Requires-Dist: pandas; extra == "msg"
|
|
60
59
|
Requires-Dist: numpy; extra == "msg"
|
|
60
|
+
Requires-Dist: pandas; extra == "msg"
|
|
61
61
|
Provides-Extra: odt
|
|
62
62
|
Requires-Dist: unstructured[odt]; extra == "odt"
|
|
63
|
-
Requires-Dist: pandas; extra == "odt"
|
|
64
63
|
Requires-Dist: numpy; extra == "odt"
|
|
64
|
+
Requires-Dist: pandas; extra == "odt"
|
|
65
65
|
Provides-Extra: org
|
|
66
66
|
Requires-Dist: unstructured[org]; extra == "org"
|
|
67
|
-
Requires-Dist: pandas; extra == "org"
|
|
68
67
|
Requires-Dist: numpy; extra == "org"
|
|
68
|
+
Requires-Dist: pandas; extra == "org"
|
|
69
69
|
Provides-Extra: pdf
|
|
70
70
|
Requires-Dist: unstructured[pdf]; extra == "pdf"
|
|
71
|
-
Requires-Dist: pandas; extra == "pdf"
|
|
72
71
|
Requires-Dist: numpy; extra == "pdf"
|
|
72
|
+
Requires-Dist: pandas; extra == "pdf"
|
|
73
73
|
Provides-Extra: ppt
|
|
74
74
|
Requires-Dist: unstructured[pptx]; extra == "ppt"
|
|
75
|
-
Requires-Dist: pandas; extra == "ppt"
|
|
76
75
|
Requires-Dist: numpy; extra == "ppt"
|
|
76
|
+
Requires-Dist: pandas; extra == "ppt"
|
|
77
77
|
Provides-Extra: pptx
|
|
78
78
|
Requires-Dist: unstructured[pptx]; extra == "pptx"
|
|
79
|
-
Requires-Dist: pandas; extra == "pptx"
|
|
80
79
|
Requires-Dist: numpy; extra == "pptx"
|
|
80
|
+
Requires-Dist: pandas; extra == "pptx"
|
|
81
81
|
Provides-Extra: rtf
|
|
82
82
|
Requires-Dist: unstructured[rtf]; extra == "rtf"
|
|
83
|
-
Requires-Dist: pandas; extra == "rtf"
|
|
84
83
|
Requires-Dist: numpy; extra == "rtf"
|
|
84
|
+
Requires-Dist: pandas; extra == "rtf"
|
|
85
85
|
Provides-Extra: rst
|
|
86
86
|
Requires-Dist: unstructured[rst]; extra == "rst"
|
|
87
|
-
Requires-Dist: pandas; extra == "rst"
|
|
88
87
|
Requires-Dist: numpy; extra == "rst"
|
|
88
|
+
Requires-Dist: pandas; extra == "rst"
|
|
89
89
|
Provides-Extra: tsv
|
|
90
90
|
Requires-Dist: unstructured[tsv]; extra == "tsv"
|
|
91
|
-
Requires-Dist: pandas; extra == "tsv"
|
|
92
91
|
Requires-Dist: numpy; extra == "tsv"
|
|
92
|
+
Requires-Dist: pandas; extra == "tsv"
|
|
93
93
|
Provides-Extra: xlsx
|
|
94
94
|
Requires-Dist: unstructured[xlsx]; extra == "xlsx"
|
|
95
|
-
Requires-Dist: pandas; extra == "xlsx"
|
|
96
95
|
Requires-Dist: numpy; extra == "xlsx"
|
|
96
|
+
Requires-Dist: pandas; extra == "xlsx"
|
|
97
97
|
Provides-Extra: airtable
|
|
98
98
|
Requires-Dist: pyairtable; extra == "airtable"
|
|
99
|
-
Requires-Dist: pandas; extra == "airtable"
|
|
100
99
|
Requires-Dist: numpy; extra == "airtable"
|
|
100
|
+
Requires-Dist: pandas; extra == "airtable"
|
|
101
101
|
Provides-Extra: astradb
|
|
102
102
|
Requires-Dist: astrapy; extra == "astradb"
|
|
103
|
-
Requires-Dist: pandas; extra == "astradb"
|
|
104
103
|
Requires-Dist: numpy; extra == "astradb"
|
|
104
|
+
Requires-Dist: pandas; extra == "astradb"
|
|
105
105
|
Provides-Extra: azure
|
|
106
|
-
Requires-Dist: fsspec; extra == "azure"
|
|
107
106
|
Requires-Dist: adlfs; extra == "azure"
|
|
108
|
-
Requires-Dist: pandas; extra == "azure"
|
|
107
|
+
Requires-Dist: fsspec; extra == "azure"
|
|
109
108
|
Requires-Dist: numpy; extra == "azure"
|
|
109
|
+
Requires-Dist: pandas; extra == "azure"
|
|
110
110
|
Provides-Extra: azure-ai-search
|
|
111
111
|
Requires-Dist: azure-search-documents; extra == "azure-ai-search"
|
|
112
|
-
Requires-Dist: pandas; extra == "azure-ai-search"
|
|
113
112
|
Requires-Dist: numpy; extra == "azure-ai-search"
|
|
113
|
+
Requires-Dist: pandas; extra == "azure-ai-search"
|
|
114
114
|
Provides-Extra: biomed
|
|
115
|
-
Requires-Dist: bs4; extra == "biomed"
|
|
116
115
|
Requires-Dist: requests; extra == "biomed"
|
|
117
|
-
Requires-Dist: pandas; extra == "biomed"
|
|
116
|
+
Requires-Dist: bs4; extra == "biomed"
|
|
118
117
|
Requires-Dist: numpy; extra == "biomed"
|
|
118
|
+
Requires-Dist: pandas; extra == "biomed"
|
|
119
119
|
Provides-Extra: box
|
|
120
|
-
Requires-Dist: fsspec; extra == "box"
|
|
121
120
|
Requires-Dist: boxfs; extra == "box"
|
|
122
|
-
Requires-Dist: pandas; extra == "box"
|
|
121
|
+
Requires-Dist: fsspec; extra == "box"
|
|
123
122
|
Requires-Dist: numpy; extra == "box"
|
|
123
|
+
Requires-Dist: pandas; extra == "box"
|
|
124
124
|
Provides-Extra: chroma
|
|
125
125
|
Requires-Dist: chromadb; extra == "chroma"
|
|
126
|
-
Requires-Dist: pandas; extra == "chroma"
|
|
127
126
|
Requires-Dist: numpy; extra == "chroma"
|
|
127
|
+
Requires-Dist: pandas; extra == "chroma"
|
|
128
128
|
Provides-Extra: clarifai
|
|
129
129
|
Requires-Dist: clarifai; extra == "clarifai"
|
|
130
|
-
Requires-Dist: pandas; extra == "clarifai"
|
|
131
130
|
Requires-Dist: numpy; extra == "clarifai"
|
|
131
|
+
Requires-Dist: pandas; extra == "clarifai"
|
|
132
132
|
Provides-Extra: confluence
|
|
133
|
-
Requires-Dist: requests; extra == "confluence"
|
|
134
133
|
Requires-Dist: atlassian-python-api; extra == "confluence"
|
|
135
|
-
Requires-Dist: pandas; extra == "confluence"
|
|
134
|
+
Requires-Dist: requests; extra == "confluence"
|
|
136
135
|
Requires-Dist: numpy; extra == "confluence"
|
|
136
|
+
Requires-Dist: pandas; extra == "confluence"
|
|
137
137
|
Provides-Extra: couchbase
|
|
138
138
|
Requires-Dist: couchbase; extra == "couchbase"
|
|
139
|
-
Requires-Dist: pandas; extra == "couchbase"
|
|
140
139
|
Requires-Dist: numpy; extra == "couchbase"
|
|
140
|
+
Requires-Dist: pandas; extra == "couchbase"
|
|
141
141
|
Provides-Extra: delta-table
|
|
142
142
|
Requires-Dist: deltalake; extra == "delta-table"
|
|
143
143
|
Requires-Dist: boto3; extra == "delta-table"
|
|
144
|
-
Requires-Dist: pandas; extra == "delta-table"
|
|
145
144
|
Requires-Dist: numpy; extra == "delta-table"
|
|
145
|
+
Requires-Dist: pandas; extra == "delta-table"
|
|
146
146
|
Provides-Extra: discord
|
|
147
147
|
Requires-Dist: discord.py; extra == "discord"
|
|
148
|
-
Requires-Dist: pandas; extra == "discord"
|
|
149
148
|
Requires-Dist: numpy; extra == "discord"
|
|
149
|
+
Requires-Dist: pandas; extra == "discord"
|
|
150
150
|
Provides-Extra: dropbox
|
|
151
151
|
Requires-Dist: dropboxdrivefs; extra == "dropbox"
|
|
152
152
|
Requires-Dist: fsspec; extra == "dropbox"
|
|
153
|
-
Requires-Dist: pandas; extra == "dropbox"
|
|
154
153
|
Requires-Dist: numpy; extra == "dropbox"
|
|
154
|
+
Requires-Dist: pandas; extra == "dropbox"
|
|
155
155
|
Provides-Extra: duckdb
|
|
156
156
|
Requires-Dist: duckdb; extra == "duckdb"
|
|
157
|
-
Requires-Dist: pandas; extra == "duckdb"
|
|
158
157
|
Requires-Dist: numpy; extra == "duckdb"
|
|
158
|
+
Requires-Dist: pandas; extra == "duckdb"
|
|
159
159
|
Provides-Extra: elasticsearch
|
|
160
160
|
Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
|
|
161
|
-
Requires-Dist: pandas; extra == "elasticsearch"
|
|
162
161
|
Requires-Dist: numpy; extra == "elasticsearch"
|
|
162
|
+
Requires-Dist: pandas; extra == "elasticsearch"
|
|
163
163
|
Provides-Extra: gcs
|
|
164
|
+
Requires-Dist: gcsfs; extra == "gcs"
|
|
164
165
|
Requires-Dist: bs4; extra == "gcs"
|
|
165
166
|
Requires-Dist: fsspec; extra == "gcs"
|
|
166
|
-
Requires-Dist: gcsfs; extra == "gcs"
|
|
167
|
-
Requires-Dist: pandas; extra == "gcs"
|
|
168
167
|
Requires-Dist: numpy; extra == "gcs"
|
|
168
|
+
Requires-Dist: pandas; extra == "gcs"
|
|
169
169
|
Provides-Extra: github
|
|
170
170
|
Requires-Dist: requests; extra == "github"
|
|
171
171
|
Requires-Dist: pygithub>1.58.0; extra == "github"
|
|
172
|
-
Requires-Dist: pandas; extra == "github"
|
|
173
172
|
Requires-Dist: numpy; extra == "github"
|
|
173
|
+
Requires-Dist: pandas; extra == "github"
|
|
174
174
|
Provides-Extra: gitlab
|
|
175
175
|
Requires-Dist: python-gitlab; extra == "gitlab"
|
|
176
|
-
Requires-Dist: pandas; extra == "gitlab"
|
|
177
176
|
Requires-Dist: numpy; extra == "gitlab"
|
|
177
|
+
Requires-Dist: pandas; extra == "gitlab"
|
|
178
178
|
Provides-Extra: google-drive
|
|
179
179
|
Requires-Dist: google-api-python-client; extra == "google-drive"
|
|
180
|
-
Requires-Dist: pandas; extra == "google-drive"
|
|
181
180
|
Requires-Dist: numpy; extra == "google-drive"
|
|
181
|
+
Requires-Dist: pandas; extra == "google-drive"
|
|
182
182
|
Provides-Extra: hubspot
|
|
183
|
-
Requires-Dist: hubspot-api-client; extra == "hubspot"
|
|
184
183
|
Requires-Dist: urllib3; extra == "hubspot"
|
|
185
|
-
Requires-Dist: pandas; extra == "hubspot"
|
|
184
|
+
Requires-Dist: hubspot-api-client; extra == "hubspot"
|
|
186
185
|
Requires-Dist: numpy; extra == "hubspot"
|
|
186
|
+
Requires-Dist: pandas; extra == "hubspot"
|
|
187
187
|
Provides-Extra: ibm-watsonx-s3
|
|
188
188
|
Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
|
|
189
189
|
Requires-Dist: httpx; extra == "ibm-watsonx-s3"
|
|
190
|
-
Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
|
|
191
190
|
Requires-Dist: pyiceberg; extra == "ibm-watsonx-s3"
|
|
192
|
-
Requires-Dist: pandas; extra == "ibm-watsonx-s3"
|
|
191
|
+
Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
|
|
193
192
|
Requires-Dist: numpy; extra == "ibm-watsonx-s3"
|
|
193
|
+
Requires-Dist: pandas; extra == "ibm-watsonx-s3"
|
|
194
194
|
Provides-Extra: jira
|
|
195
195
|
Requires-Dist: atlassian-python-api; extra == "jira"
|
|
196
|
-
Requires-Dist: pandas; extra == "jira"
|
|
197
196
|
Requires-Dist: numpy; extra == "jira"
|
|
197
|
+
Requires-Dist: pandas; extra == "jira"
|
|
198
198
|
Provides-Extra: kafka
|
|
199
199
|
Requires-Dist: confluent-kafka; extra == "kafka"
|
|
200
|
-
Requires-Dist: pandas; extra == "kafka"
|
|
201
200
|
Requires-Dist: numpy; extra == "kafka"
|
|
201
|
+
Requires-Dist: pandas; extra == "kafka"
|
|
202
202
|
Provides-Extra: kdbai
|
|
203
203
|
Requires-Dist: kdbai-client>=1.4.0; extra == "kdbai"
|
|
204
|
-
Requires-Dist: pandas; extra == "kdbai"
|
|
205
204
|
Requires-Dist: numpy; extra == "kdbai"
|
|
205
|
+
Requires-Dist: pandas; extra == "kdbai"
|
|
206
206
|
Provides-Extra: lancedb
|
|
207
207
|
Requires-Dist: lancedb; extra == "lancedb"
|
|
208
|
-
Requires-Dist: pandas; extra == "lancedb"
|
|
209
208
|
Requires-Dist: numpy; extra == "lancedb"
|
|
209
|
+
Requires-Dist: pandas; extra == "lancedb"
|
|
210
210
|
Provides-Extra: milvus
|
|
211
211
|
Requires-Dist: pymilvus; extra == "milvus"
|
|
212
|
-
Requires-Dist: pandas; extra == "milvus"
|
|
213
212
|
Requires-Dist: numpy; extra == "milvus"
|
|
213
|
+
Requires-Dist: pandas; extra == "milvus"
|
|
214
214
|
Provides-Extra: mongodb
|
|
215
215
|
Requires-Dist: pymongo; extra == "mongodb"
|
|
216
|
-
Requires-Dist: pandas; extra == "mongodb"
|
|
217
216
|
Requires-Dist: numpy; extra == "mongodb"
|
|
217
|
+
Requires-Dist: pandas; extra == "mongodb"
|
|
218
218
|
Provides-Extra: neo4j
|
|
219
|
-
Requires-Dist: cymple; extra == "neo4j"
|
|
220
219
|
Requires-Dist: neo4j-rust-ext; extra == "neo4j"
|
|
221
220
|
Requires-Dist: networkx; extra == "neo4j"
|
|
222
|
-
Requires-Dist: pandas; extra == "neo4j"
|
|
221
|
+
Requires-Dist: cymple; extra == "neo4j"
|
|
223
222
|
Requires-Dist: numpy; extra == "neo4j"
|
|
223
|
+
Requires-Dist: pandas; extra == "neo4j"
|
|
224
224
|
Provides-Extra: notion
|
|
225
225
|
Requires-Dist: notion-client; extra == "notion"
|
|
226
|
-
Requires-Dist: backoff; extra == "notion"
|
|
227
226
|
Requires-Dist: httpx; extra == "notion"
|
|
227
|
+
Requires-Dist: backoff; extra == "notion"
|
|
228
228
|
Requires-Dist: htmlBuilder; extra == "notion"
|
|
229
|
-
Requires-Dist: pandas; extra == "notion"
|
|
230
229
|
Requires-Dist: numpy; extra == "notion"
|
|
230
|
+
Requires-Dist: pandas; extra == "notion"
|
|
231
231
|
Provides-Extra: onedrive
|
|
232
|
-
Requires-Dist: msal; extra == "onedrive"
|
|
233
|
-
Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
|
|
234
232
|
Requires-Dist: requests; extra == "onedrive"
|
|
235
|
-
Requires-Dist: pandas; extra == "onedrive"
|
|
233
|
+
Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
|
|
234
|
+
Requires-Dist: msal; extra == "onedrive"
|
|
236
235
|
Requires-Dist: numpy; extra == "onedrive"
|
|
236
|
+
Requires-Dist: pandas; extra == "onedrive"
|
|
237
237
|
Provides-Extra: opensearch
|
|
238
238
|
Requires-Dist: opensearch-py; extra == "opensearch"
|
|
239
|
-
Requires-Dist: pandas; extra == "opensearch"
|
|
240
239
|
Requires-Dist: numpy; extra == "opensearch"
|
|
240
|
+
Requires-Dist: pandas; extra == "opensearch"
|
|
241
241
|
Provides-Extra: outlook
|
|
242
|
-
Requires-Dist: msal; extra == "outlook"
|
|
243
242
|
Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
|
|
244
|
-
Requires-Dist: pandas; extra == "outlook"
|
|
243
|
+
Requires-Dist: msal; extra == "outlook"
|
|
245
244
|
Requires-Dist: numpy; extra == "outlook"
|
|
245
|
+
Requires-Dist: pandas; extra == "outlook"
|
|
246
246
|
Provides-Extra: pinecone
|
|
247
247
|
Requires-Dist: pinecone-client>=3.7.1; extra == "pinecone"
|
|
248
|
-
Requires-Dist: pandas; extra == "pinecone"
|
|
249
248
|
Requires-Dist: numpy; extra == "pinecone"
|
|
249
|
+
Requires-Dist: pandas; extra == "pinecone"
|
|
250
250
|
Provides-Extra: postgres
|
|
251
251
|
Requires-Dist: psycopg2-binary; extra == "postgres"
|
|
252
|
-
Requires-Dist: pandas; extra == "postgres"
|
|
253
252
|
Requires-Dist: numpy; extra == "postgres"
|
|
253
|
+
Requires-Dist: pandas; extra == "postgres"
|
|
254
254
|
Provides-Extra: qdrant
|
|
255
255
|
Requires-Dist: qdrant-client; extra == "qdrant"
|
|
256
|
-
Requires-Dist: pandas; extra == "qdrant"
|
|
257
256
|
Requires-Dist: numpy; extra == "qdrant"
|
|
257
|
+
Requires-Dist: pandas; extra == "qdrant"
|
|
258
258
|
Provides-Extra: reddit
|
|
259
259
|
Requires-Dist: praw; extra == "reddit"
|
|
260
|
-
Requires-Dist: pandas; extra == "reddit"
|
|
261
260
|
Requires-Dist: numpy; extra == "reddit"
|
|
261
|
+
Requires-Dist: pandas; extra == "reddit"
|
|
262
262
|
Provides-Extra: redis
|
|
263
263
|
Requires-Dist: redis; extra == "redis"
|
|
264
|
-
Requires-Dist: pandas; extra == "redis"
|
|
265
264
|
Requires-Dist: numpy; extra == "redis"
|
|
265
|
+
Requires-Dist: pandas; extra == "redis"
|
|
266
266
|
Provides-Extra: s3
|
|
267
|
-
Requires-Dist: fsspec; extra == "s3"
|
|
268
267
|
Requires-Dist: s3fs; extra == "s3"
|
|
269
|
-
Requires-Dist: pandas; extra == "s3"
|
|
268
|
+
Requires-Dist: fsspec; extra == "s3"
|
|
270
269
|
Requires-Dist: numpy; extra == "s3"
|
|
270
|
+
Requires-Dist: pandas; extra == "s3"
|
|
271
271
|
Provides-Extra: sharepoint
|
|
272
|
-
Requires-Dist: msal; extra == "sharepoint"
|
|
273
|
-
Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
|
|
274
272
|
Requires-Dist: requests; extra == "sharepoint"
|
|
275
|
-
Requires-Dist: pandas; extra == "sharepoint"
|
|
273
|
+
Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
|
|
274
|
+
Requires-Dist: msal; extra == "sharepoint"
|
|
276
275
|
Requires-Dist: numpy; extra == "sharepoint"
|
|
276
|
+
Requires-Dist: pandas; extra == "sharepoint"
|
|
277
277
|
Provides-Extra: salesforce
|
|
278
278
|
Requires-Dist: simple-salesforce; extra == "salesforce"
|
|
279
|
-
Requires-Dist: pandas; extra == "salesforce"
|
|
280
279
|
Requires-Dist: numpy; extra == "salesforce"
|
|
280
|
+
Requires-Dist: pandas; extra == "salesforce"
|
|
281
281
|
Provides-Extra: sftp
|
|
282
282
|
Requires-Dist: paramiko; extra == "sftp"
|
|
283
283
|
Requires-Dist: fsspec; extra == "sftp"
|
|
284
|
-
Requires-Dist: pandas; extra == "sftp"
|
|
285
284
|
Requires-Dist: numpy; extra == "sftp"
|
|
285
|
+
Requires-Dist: pandas; extra == "sftp"
|
|
286
286
|
Provides-Extra: slack
|
|
287
287
|
Requires-Dist: slack_sdk[optional]; extra == "slack"
|
|
288
|
-
Requires-Dist: pandas; extra == "slack"
|
|
289
288
|
Requires-Dist: numpy; extra == "slack"
|
|
289
|
+
Requires-Dist: pandas; extra == "slack"
|
|
290
290
|
Provides-Extra: snowflake
|
|
291
291
|
Requires-Dist: snowflake-connector-python; extra == "snowflake"
|
|
292
292
|
Requires-Dist: psycopg2-binary; extra == "snowflake"
|
|
293
|
-
Requires-Dist: pandas; extra == "snowflake"
|
|
294
293
|
Requires-Dist: numpy; extra == "snowflake"
|
|
294
|
+
Requires-Dist: pandas; extra == "snowflake"
|
|
295
295
|
Provides-Extra: wikipedia
|
|
296
296
|
Requires-Dist: wikipedia; extra == "wikipedia"
|
|
297
|
-
Requires-Dist: pandas; extra == "wikipedia"
|
|
298
297
|
Requires-Dist: numpy; extra == "wikipedia"
|
|
298
|
+
Requires-Dist: pandas; extra == "wikipedia"
|
|
299
299
|
Provides-Extra: weaviate
|
|
300
300
|
Requires-Dist: weaviate-client; extra == "weaviate"
|
|
301
|
-
Requires-Dist: pandas; extra == "weaviate"
|
|
302
301
|
Requires-Dist: numpy; extra == "weaviate"
|
|
302
|
+
Requires-Dist: pandas; extra == "weaviate"
|
|
303
303
|
Provides-Extra: databricks-volumes
|
|
304
304
|
Requires-Dist: databricks-sdk; extra == "databricks-volumes"
|
|
305
|
-
Requires-Dist: pandas; extra == "databricks-volumes"
|
|
306
305
|
Requires-Dist: numpy; extra == "databricks-volumes"
|
|
306
|
+
Requires-Dist: pandas; extra == "databricks-volumes"
|
|
307
307
|
Provides-Extra: databricks-delta-tables
|
|
308
308
|
Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
|
|
309
|
-
Requires-Dist: pandas; extra == "databricks-delta-tables"
|
|
310
309
|
Requires-Dist: numpy; extra == "databricks-delta-tables"
|
|
310
|
+
Requires-Dist: pandas; extra == "databricks-delta-tables"
|
|
311
311
|
Provides-Extra: singlestore
|
|
312
312
|
Requires-Dist: singlestoredb; extra == "singlestore"
|
|
313
|
-
Requires-Dist: pandas; extra == "singlestore"
|
|
314
313
|
Requires-Dist: numpy; extra == "singlestore"
|
|
314
|
+
Requires-Dist: pandas; extra == "singlestore"
|
|
315
315
|
Provides-Extra: vectara
|
|
316
316
|
Requires-Dist: requests; extra == "vectara"
|
|
317
317
|
Requires-Dist: httpx; extra == "vectara"
|
|
318
318
|
Requires-Dist: aiofiles; extra == "vectara"
|
|
319
|
-
Requires-Dist: pandas; extra == "vectara"
|
|
320
319
|
Requires-Dist: numpy; extra == "vectara"
|
|
320
|
+
Requires-Dist: pandas; extra == "vectara"
|
|
321
321
|
Provides-Extra: vastdb
|
|
322
322
|
Requires-Dist: ibis; extra == "vastdb"
|
|
323
323
|
Requires-Dist: vastdb; extra == "vastdb"
|
|
324
324
|
Requires-Dist: pyarrow; extra == "vastdb"
|
|
325
|
-
Requires-Dist: pandas; extra == "vastdb"
|
|
326
325
|
Requires-Dist: numpy; extra == "vastdb"
|
|
326
|
+
Requires-Dist: pandas; extra == "vastdb"
|
|
327
327
|
Provides-Extra: zendesk
|
|
328
|
+
Requires-Dist: aiofiles; extra == "zendesk"
|
|
328
329
|
Requires-Dist: bs4; extra == "zendesk"
|
|
329
330
|
Requires-Dist: httpx; extra == "zendesk"
|
|
330
|
-
Requires-Dist: aiofiles; extra == "zendesk"
|
|
331
|
-
Requires-Dist: pandas; extra == "zendesk"
|
|
332
331
|
Requires-Dist: numpy; extra == "zendesk"
|
|
332
|
+
Requires-Dist: pandas; extra == "zendesk"
|
|
333
333
|
Provides-Extra: embed-huggingface
|
|
334
334
|
Requires-Dist: sentence-transformers; extra == "embed-huggingface"
|
|
335
|
-
Requires-Dist: pandas; extra == "embed-huggingface"
|
|
336
335
|
Requires-Dist: numpy; extra == "embed-huggingface"
|
|
336
|
+
Requires-Dist: pandas; extra == "embed-huggingface"
|
|
337
337
|
Provides-Extra: embed-octoai
|
|
338
|
-
Requires-Dist: tiktoken; extra == "embed-octoai"
|
|
339
338
|
Requires-Dist: openai; extra == "embed-octoai"
|
|
340
|
-
Requires-Dist: pandas; extra == "embed-octoai"
|
|
339
|
+
Requires-Dist: tiktoken; extra == "embed-octoai"
|
|
341
340
|
Requires-Dist: numpy; extra == "embed-octoai"
|
|
341
|
+
Requires-Dist: pandas; extra == "embed-octoai"
|
|
342
342
|
Provides-Extra: embed-vertexai
|
|
343
343
|
Requires-Dist: vertexai; extra == "embed-vertexai"
|
|
344
|
-
Requires-Dist: pandas; extra == "embed-vertexai"
|
|
345
344
|
Requires-Dist: numpy; extra == "embed-vertexai"
|
|
345
|
+
Requires-Dist: pandas; extra == "embed-vertexai"
|
|
346
346
|
Provides-Extra: embed-voyageai
|
|
347
347
|
Requires-Dist: voyageai; extra == "embed-voyageai"
|
|
348
|
-
Requires-Dist: pandas; extra == "embed-voyageai"
|
|
349
348
|
Requires-Dist: numpy; extra == "embed-voyageai"
|
|
349
|
+
Requires-Dist: pandas; extra == "embed-voyageai"
|
|
350
350
|
Provides-Extra: embed-mixedbreadai
|
|
351
351
|
Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
|
|
352
|
-
Requires-Dist: pandas; extra == "embed-mixedbreadai"
|
|
353
352
|
Requires-Dist: numpy; extra == "embed-mixedbreadai"
|
|
353
|
+
Requires-Dist: pandas; extra == "embed-mixedbreadai"
|
|
354
354
|
Provides-Extra: openai
|
|
355
|
-
Requires-Dist: tiktoken; extra == "openai"
|
|
356
355
|
Requires-Dist: openai; extra == "openai"
|
|
357
|
-
Requires-Dist: pandas; extra == "openai"
|
|
356
|
+
Requires-Dist: tiktoken; extra == "openai"
|
|
358
357
|
Requires-Dist: numpy; extra == "openai"
|
|
358
|
+
Requires-Dist: pandas; extra == "openai"
|
|
359
359
|
Provides-Extra: bedrock
|
|
360
|
-
Requires-Dist: boto3; extra == "bedrock"
|
|
361
360
|
Requires-Dist: aioboto3; extra == "bedrock"
|
|
362
|
-
Requires-Dist: pandas; extra == "bedrock"
|
|
361
|
+
Requires-Dist: boto3; extra == "bedrock"
|
|
363
362
|
Requires-Dist: numpy; extra == "bedrock"
|
|
363
|
+
Requires-Dist: pandas; extra == "bedrock"
|
|
364
364
|
Provides-Extra: togetherai
|
|
365
365
|
Requires-Dist: together; extra == "togetherai"
|
|
366
|
-
Requires-Dist: pandas; extra == "togetherai"
|
|
367
366
|
Requires-Dist: numpy; extra == "togetherai"
|
|
367
|
+
Requires-Dist: pandas; extra == "togetherai"
|
|
368
368
|
Dynamic: author
|
|
369
369
|
Dynamic: author-email
|
|
370
370
|
Dynamic: classifier
|
|
@@ -101,7 +101,6 @@ test/unit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
|
101
101
|
test/unit/test_error.py,sha256=RflmngCdFNKOLXVfLnUdNfY3Mfg3k7DTEzfIl0B-syU,840
|
|
102
102
|
test/unit/test_html.py,sha256=ubsck9pVOnPDFL0P8TZkko_46MIaFLlSNQcsgFDgYoE,4496
|
|
103
103
|
test/unit/test_interfaces.py,sha256=Gv3WMJsw_3xPLy3nI3dIcJuLa2WvKYszSjI_W9XLtVM,787
|
|
104
|
-
test/unit/test_logger.py,sha256=0SKndXE_VRd8XmUHkrj7zuBQHZscXx3ZQllMEOvtF9Y,2380
|
|
105
104
|
test/unit/test_utils.py,sha256=xeSM02zOChSOO3dzDOVAEiQme1rQ8drjnJF93S3BFmk,7247
|
|
106
105
|
test/unit/chunkers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
107
106
|
test/unit/chunkers/test_chunkers.py,sha256=wRxbSj7P1FwRGDyVcARkm8CQSVCBCro3nTe54UoUBzc,1769
|
|
@@ -134,10 +133,10 @@ test/unit/partitioners/test_partitioner.py,sha256=eJoUDbiKtweyU1WYfsY5KqVqoPjbx1
|
|
|
134
133
|
test/unit/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
135
134
|
test/unit/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
|
|
136
135
|
unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
137
|
-
unstructured_ingest/__version__.py,sha256=
|
|
136
|
+
unstructured_ingest/__version__.py,sha256=7O8GlC09PP-XuUDOj6bhRUtbOuUgpBT2COw4AjU1kk0,42
|
|
138
137
|
unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
|
|
139
138
|
unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
|
|
140
|
-
unstructured_ingest/logger.py,sha256=
|
|
139
|
+
unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
|
|
141
140
|
unstructured_ingest/main.py,sha256=82G_7eG4PNhc_xIqj4Y_sFbDV9VI-nwSfsfJQMzovMk,169
|
|
142
141
|
unstructured_ingest/otel.py,sha256=NsUqOolA0gt69eFhZLABjVpcKoM9aus-AbxIKqWqPTc,4127
|
|
143
142
|
unstructured_ingest/unstructured_api.py,sha256=hWUXUhGtyfi2OcDR-BriHJyT4jJywf4zfG1qpSCf9Bo,5002
|
|
@@ -222,6 +221,8 @@ unstructured_ingest/processes/connectors/slack.py,sha256=e4ntATdht_olAPsco1DKwlr
|
|
|
222
221
|
unstructured_ingest/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
|
|
223
222
|
unstructured_ingest/processes/connectors/vectara.py,sha256=frKJkc7ffstQhXD9-HkAGoQAofGkl6AsnKJhGcl8LgA,12294
|
|
224
223
|
unstructured_ingest/processes/connectors/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
224
|
+
unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql,sha256=dUZZDNkyvQXKqoAThRz3ek7zaUE2l_LAQimlG5WZhH4,211
|
|
225
|
+
unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json,sha256=SJlIO0kXxy866tWQ8bEzvwLwflsoUMIS-OKlxMvHIuE,504
|
|
225
226
|
unstructured_ingest/processes/connectors/databricks/__init__.py,sha256=RtKAPyNtXh6fzEsOQ08pA0-vC1uMr3KqYG6cqiBoo70,2133
|
|
226
227
|
unstructured_ingest/processes/connectors/databricks/volumes.py,sha256=OWQrne9-5hPzc-kxGa2P53M3DoksDzMDyjLhQyihdCo,8020
|
|
227
228
|
unstructured_ingest/processes/connectors/databricks/volumes_aws.py,sha256=RP9rq2sfysygiqzXj6eX0CXeZpxk65xmrz7HZnWRQWA,2961
|
|
@@ -360,11 +361,11 @@ unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWh
|
|
|
360
361
|
unstructured_ingest/utils/html.py,sha256=0WduP8tI5S3nHFQi6XHNPHgsIC9j3iWwyIayX9gDLiE,6386
|
|
361
362
|
unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01qAbElH0,1201
|
|
362
363
|
unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
|
|
363
|
-
unstructured_ingest/utils/string_and_date_utils.py,sha256=
|
|
364
|
+
unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
|
|
364
365
|
unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
|
|
365
|
-
unstructured_ingest-0.7.1.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
|
|
366
|
-
unstructured_ingest-0.7.
|
|
367
|
-
unstructured_ingest-0.7.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
368
|
-
unstructured_ingest-0.7.1.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
369
|
-
unstructured_ingest-0.7.1.dist-info/top_level.txt,sha256=85vUyT6fV2A5eCEM3M3FPRUUI9vZOVK1xVZt7eo1oV8,34
|
|
370
|
-
unstructured_ingest-0.7.1.dist-info/RECORD,,
|
|
366
|
+
unstructured_ingest-0.7.2.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
|
|
367
|
+
unstructured_ingest-0.7.2.dist-info/METADATA,sha256=BjJRt_WKMPbiOWOxGZPs3Q9ZmwHRkPfF0FbWT7X7lA4,15050
|
|
368
|
+
unstructured_ingest-0.7.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
369
|
+
unstructured_ingest-0.7.2.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
370
|
+
unstructured_ingest-0.7.2.dist-info/top_level.txt,sha256=85vUyT6fV2A5eCEM3M3FPRUUI9vZOVK1xVZt7eo1oV8,34
|
|
371
|
+
unstructured_ingest-0.7.2.dist-info/RECORD,,
|
test/unit/test_logger.py
DELETED
|
@@ -1,78 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
|
|
3
|
-
import pytest
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.logger import (
|
|
6
|
-
default_is_data_sensitive,
|
|
7
|
-
hide_sensitive_fields,
|
|
8
|
-
redact_jsons,
|
|
9
|
-
)
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
@pytest.mark.parametrize(
|
|
13
|
-
("key", "value", "is_sensitive"),
|
|
14
|
-
[
|
|
15
|
-
("username", "john_smith", False),
|
|
16
|
-
("password", "13?H%", True),
|
|
17
|
-
("token", "123", True),
|
|
18
|
-
("AWS_CREDENTIAL", "aws_credential", True),
|
|
19
|
-
("AWS_KEY", None, False),
|
|
20
|
-
],
|
|
21
|
-
)
|
|
22
|
-
def test_default_is_sensitive(key, value, is_sensitive):
|
|
23
|
-
assert default_is_data_sensitive(key, value) == is_sensitive
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
def test_hide_sensitive_fields():
|
|
27
|
-
d = {
|
|
28
|
-
"username": "john_smith",
|
|
29
|
-
"password": "13?H%",
|
|
30
|
-
"inner": {
|
|
31
|
-
"token": "123",
|
|
32
|
-
"AWS_KEY": None,
|
|
33
|
-
"inner_j_string": json.dumps(
|
|
34
|
-
{"account_name": "secret name", "client_id": 123, "timestamp": 123}
|
|
35
|
-
),
|
|
36
|
-
},
|
|
37
|
-
}
|
|
38
|
-
redacted_d = hide_sensitive_fields(d)
|
|
39
|
-
expected_d = {
|
|
40
|
-
"password": "*******",
|
|
41
|
-
"username": "john_smith",
|
|
42
|
-
"inner": {
|
|
43
|
-
"token": "*******",
|
|
44
|
-
"AWS_KEY": None,
|
|
45
|
-
"inner_j_string": json.dumps(
|
|
46
|
-
{"account_name": "*******", "client_id": "*******", "timestamp": 123}
|
|
47
|
-
),
|
|
48
|
-
},
|
|
49
|
-
}
|
|
50
|
-
assert redacted_d == expected_d
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
def test_redact_jsons():
|
|
54
|
-
d1 = {
|
|
55
|
-
"username": "john_smith",
|
|
56
|
-
"password": "13?H%",
|
|
57
|
-
"inner": {
|
|
58
|
-
"token": "123",
|
|
59
|
-
"AWS_KEY": None,
|
|
60
|
-
"inner_j_string": json.dumps(
|
|
61
|
-
{"account_name": "secret name", "client_id": 123, "timestamp": 123}
|
|
62
|
-
),
|
|
63
|
-
},
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
d2 = {"username": "tim67", "update_time": 456}
|
|
67
|
-
d3 = {"account_name": "top secret", "host": "http://localhost:8888"}
|
|
68
|
-
|
|
69
|
-
sensitive_string = f"Some topic secret info ({json.dumps(d1)} regarding {d2} and {d3})"
|
|
70
|
-
expected_string = (
|
|
71
|
-
'Some topic secret info ({"username": "john_smith", "password": "*******", '
|
|
72
|
-
'"inner": {"token": "*******", "AWS_KEY": null, "inner_j_string": '
|
|
73
|
-
'"{\\"account_name\\": \\"*******\\", \\"client_id\\": \\"*******\\", '
|
|
74
|
-
'\\"timestamp\\": 123}"}} regarding {"username": "tim67", "update_time": 456} '
|
|
75
|
-
'and {"account_name": "*******", "host": "http://localhost:8888"})'
|
|
76
|
-
)
|
|
77
|
-
redacted_string = redact_jsons(sensitive_string)
|
|
78
|
-
assert redacted_string == expected_string
|
|
File without changes
|
|
File without changes
|
{unstructured_ingest-0.7.1.dist-info → unstructured_ingest-0.7.2.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|