unstructured-ingest 0.0.0__py3-none-any.whl → 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/connector/notion/helpers.py +1 -1
- unstructured_ingest/logger.py +2 -2
- unstructured_ingest/v2/cli/cmds/__init__.py +2 -0
- unstructured_ingest/v2/cli/cmds/milvus.py +72 -0
- unstructured_ingest/v2/logger.py +1 -1
- unstructured_ingest/v2/pipeline/interfaces.py +4 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +4 -2
- unstructured_ingest/v2/processes/connectors/milvus.py +200 -0
- unstructured_ingest/v2/processes/connectors/pinecone.py +4 -4
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.1.dist-info}/METADATA +17 -15
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.1.dist-info}/RECORD +15 -13
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.1.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.1.dist-info}/top_level.txt +0 -0
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.0.0" # pragma: no cover
|
|
1
|
+
__version__ = "0.0.1" # pragma: no cover
|
|
@@ -5,7 +5,6 @@ from typing import List, Optional, Tuple
|
|
|
5
5
|
from urllib.parse import urlparse
|
|
6
6
|
from uuid import UUID
|
|
7
7
|
|
|
8
|
-
import unstructured.ingest.connector.notion.types.blocks as notion_blocks
|
|
9
8
|
from htmlBuilder.attributes import Style, Type
|
|
10
9
|
from htmlBuilder.tags import (
|
|
11
10
|
Body,
|
|
@@ -23,6 +22,7 @@ from htmlBuilder.tags import (
|
|
|
23
22
|
)
|
|
24
23
|
from notion_client.errors import APIResponseError
|
|
25
24
|
|
|
25
|
+
import unstructured_ingest.connector.notion.types.blocks as notion_blocks
|
|
26
26
|
from unstructured_ingest.connector.notion.client import Client
|
|
27
27
|
from unstructured_ingest.connector.notion.interfaces import BlockBase
|
|
28
28
|
from unstructured_ingest.connector.notion.types.block import Block
|
unstructured_ingest/logger.py
CHANGED
|
@@ -3,7 +3,7 @@ import json
|
|
|
3
3
|
import logging
|
|
4
4
|
import typing as t
|
|
5
5
|
|
|
6
|
-
logger = logging.getLogger("
|
|
6
|
+
logger = logging.getLogger("unstructured_ingest")
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
def default_is_data_sensitive(k: str, v: t.Any) -> bool:
|
|
@@ -119,7 +119,7 @@ def ingest_log_streaming_init(level: int) -> None:
|
|
|
119
119
|
|
|
120
120
|
def make_default_logger(level: int) -> logging.Logger:
|
|
121
121
|
"""Return a custom logger."""
|
|
122
|
-
logger = logging.getLogger("
|
|
122
|
+
logger = logging.getLogger("unstructured_ingest")
|
|
123
123
|
handler = logging.StreamHandler()
|
|
124
124
|
handler.name = "ingest_log_handler"
|
|
125
125
|
formatter = SensitiveFormatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
|
|
@@ -15,6 +15,7 @@ from .fsspec.s3 import s3_dest_cmd, s3_src_cmd
|
|
|
15
15
|
from .fsspec.sftp import sftp_dest_cmd, sftp_src_cmd
|
|
16
16
|
from .google_drive import google_drive_src_cmd
|
|
17
17
|
from .local import local_dest_cmd, local_src_cmd
|
|
18
|
+
from .milvus import milvus_dest_cmd
|
|
18
19
|
from .mongodb import mongodb_dest_cmd
|
|
19
20
|
from .onedrive import onedrive_drive_src_cmd
|
|
20
21
|
from .opensearch import opensearch_dest_cmd, opensearch_src_cmd
|
|
@@ -60,6 +61,7 @@ dest_cmds = [
|
|
|
60
61
|
elasticsearch_dest_cmd,
|
|
61
62
|
gcs_dest_cmd,
|
|
62
63
|
local_dest_cmd,
|
|
64
|
+
milvus_dest_cmd,
|
|
63
65
|
opensearch_dest_cmd,
|
|
64
66
|
pinecone_dest_cmd,
|
|
65
67
|
s3_dest_cmd,
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
import click
|
|
4
|
+
|
|
5
|
+
from unstructured_ingest.v2.cli.base import DestCmd
|
|
6
|
+
from unstructured_ingest.v2.cli.interfaces import CliConfig
|
|
7
|
+
from unstructured_ingest.v2.processes.connectors.milvus import CONNECTOR_TYPE
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
|
|
11
|
+
class MilvusCliConnectionConfig(CliConfig):
|
|
12
|
+
@staticmethod
|
|
13
|
+
def get_cli_options() -> list[click.Option]:
|
|
14
|
+
options = [
|
|
15
|
+
click.Option(
|
|
16
|
+
["--uri"],
|
|
17
|
+
required=False,
|
|
18
|
+
type=str,
|
|
19
|
+
default=None,
|
|
20
|
+
help="Milvus uri, eg 'http://localhost:19530",
|
|
21
|
+
),
|
|
22
|
+
click.Option(
|
|
23
|
+
["--user"],
|
|
24
|
+
required=False,
|
|
25
|
+
type=str,
|
|
26
|
+
default=None,
|
|
27
|
+
help="Milvus user",
|
|
28
|
+
),
|
|
29
|
+
click.Option(
|
|
30
|
+
["--password"],
|
|
31
|
+
required=False,
|
|
32
|
+
type=str,
|
|
33
|
+
default=None,
|
|
34
|
+
help="Milvus password",
|
|
35
|
+
),
|
|
36
|
+
click.Option(
|
|
37
|
+
["--db-name"],
|
|
38
|
+
required=False,
|
|
39
|
+
type=str,
|
|
40
|
+
default=None,
|
|
41
|
+
help="Milvus database name",
|
|
42
|
+
),
|
|
43
|
+
]
|
|
44
|
+
return options
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
|
|
48
|
+
class MilvusCliUploaderConfig(CliConfig):
|
|
49
|
+
@staticmethod
|
|
50
|
+
def get_cli_options() -> list[click.Option]:
|
|
51
|
+
options = [
|
|
52
|
+
click.Option(
|
|
53
|
+
["--collection-name"],
|
|
54
|
+
required=True,
|
|
55
|
+
type=str,
|
|
56
|
+
help="Milvus collections to write to",
|
|
57
|
+
),
|
|
58
|
+
click.Option(
|
|
59
|
+
["--num-of-processes"],
|
|
60
|
+
type=click.IntRange(min=1),
|
|
61
|
+
default=4,
|
|
62
|
+
help="number of processes to use when writing to support parallel writes",
|
|
63
|
+
),
|
|
64
|
+
]
|
|
65
|
+
return options
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
milvus_dest_cmd = DestCmd(
|
|
69
|
+
cmd_name=CONNECTOR_TYPE,
|
|
70
|
+
connection_config=MilvusCliConnectionConfig,
|
|
71
|
+
uploader_config=MilvusCliUploaderConfig,
|
|
72
|
+
)
|
unstructured_ingest/v2/logger.py
CHANGED
|
@@ -5,7 +5,7 @@ from logging import Formatter, Logger, StreamHandler, getLevelName, getLogger
|
|
|
5
5
|
from typing import Any, Callable
|
|
6
6
|
|
|
7
7
|
log_level = os.getenv("INGEST_LOG_LEVEL", "INFO")
|
|
8
|
-
LOGGER_NAME = "
|
|
8
|
+
LOGGER_NAME = "unstructured_ingest.v2"
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def default_is_data_sensitive(k: str, v: Any) -> bool:
|
|
@@ -146,6 +146,8 @@ class PipelineStep(ABC):
|
|
|
146
146
|
logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
|
|
147
147
|
if "file_data_path" in kwargs:
|
|
148
148
|
self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
|
|
149
|
+
else:
|
|
150
|
+
self.context.status[self.identifier] = {"step_error": str(e)}
|
|
149
151
|
if self.context.raise_on_error:
|
|
150
152
|
raise e
|
|
151
153
|
return None
|
|
@@ -158,6 +160,8 @@ class PipelineStep(ABC):
|
|
|
158
160
|
logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
|
|
159
161
|
if "file_data_path" in kwargs:
|
|
160
162
|
self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
|
|
163
|
+
else:
|
|
164
|
+
self.context.status[self.identifier] = {"step_error": str(e)}
|
|
161
165
|
if self.context.raise_on_error:
|
|
162
166
|
raise e
|
|
163
167
|
return None
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
import
|
|
4
|
-
|
|
3
|
+
import unstructured_ingest.v2.processes.connectors.fsspec # noqa: F401
|
|
5
4
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
6
5
|
add_destination_entry,
|
|
7
6
|
add_source_entry,
|
|
@@ -19,6 +18,8 @@ from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
|
|
|
19
18
|
from .google_drive import google_drive_source_entry
|
|
20
19
|
from .local import CONNECTOR_TYPE as LOCAL_CONNECTOR_TYPE
|
|
21
20
|
from .local import local_destination_entry, local_source_entry
|
|
21
|
+
from .milvus import CONNECTOR_TYPE as MILVUS_CONNECTOR_TYPE
|
|
22
|
+
from .milvus import milvus_destination_entry
|
|
22
23
|
from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
|
|
23
24
|
from .mongodb import mongodb_destination_entry
|
|
24
25
|
from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
|
|
@@ -75,3 +76,4 @@ add_source_entry(source_type=SHAREPOINT_CONNECTOR_TYPE, entry=sharepoint_source_
|
|
|
75
76
|
add_destination_entry(
|
|
76
77
|
destination_type=SINGLESTORE_CONNECTOR_TYPE, entry=singlestore_destination_entry
|
|
77
78
|
)
|
|
79
|
+
add_destination_entry(destination_type=MILVUS_CONNECTOR_TYPE, entry=milvus_destination_entry)
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import multiprocessing as mp
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Optional, Union
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from dateutil import parser
|
|
9
|
+
|
|
10
|
+
from unstructured_ingest.enhanced_dataclass import enhanced_field
|
|
11
|
+
from unstructured_ingest.error import WriteError
|
|
12
|
+
from unstructured_ingest.utils.data_prep import flatten_dict
|
|
13
|
+
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
14
|
+
from unstructured_ingest.v2.interfaces import (
|
|
15
|
+
AccessConfig,
|
|
16
|
+
ConnectionConfig,
|
|
17
|
+
FileData,
|
|
18
|
+
UploadContent,
|
|
19
|
+
Uploader,
|
|
20
|
+
UploaderConfig,
|
|
21
|
+
UploadStager,
|
|
22
|
+
UploadStagerConfig,
|
|
23
|
+
)
|
|
24
|
+
from unstructured_ingest.v2.logger import logger
|
|
25
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
26
|
+
DestinationRegistryEntry,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
if TYPE_CHECKING:
|
|
30
|
+
from pymilvus import MilvusClient
|
|
31
|
+
|
|
32
|
+
CONNECTOR_TYPE = "milvus"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
|
|
36
|
+
class MilvusAccessConfig(AccessConfig):
|
|
37
|
+
password: Optional[str] = None
|
|
38
|
+
token: Optional[str] = None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
|
|
42
|
+
class MilvusConnectionConfig(ConnectionConfig):
|
|
43
|
+
access_config: MilvusAccessConfig = enhanced_field(
|
|
44
|
+
sensitive=True, default_factory=lambda: MilvusAccessConfig()
|
|
45
|
+
)
|
|
46
|
+
uri: Optional[str] = None
|
|
47
|
+
user: Optional[str] = None
|
|
48
|
+
db_name: Optional[str] = None
|
|
49
|
+
|
|
50
|
+
def get_connection_kwargs(self) -> dict[str, Any]:
|
|
51
|
+
access_config_dict = self.access_config.to_dict()
|
|
52
|
+
connection_config_dict = self.to_dict()
|
|
53
|
+
connection_config_dict.pop("access_config", None)
|
|
54
|
+
connection_config_dict.update(access_config_dict)
|
|
55
|
+
# Drop any that were not set explicitly
|
|
56
|
+
connection_config_dict = {k: v for k, v in connection_config_dict.items() if v is not None}
|
|
57
|
+
return connection_config_dict
|
|
58
|
+
|
|
59
|
+
@requires_dependencies(["pymilvus"], extras="milvus")
|
|
60
|
+
def get_client(self) -> "MilvusClient":
|
|
61
|
+
from pymilvus import MilvusClient
|
|
62
|
+
|
|
63
|
+
return MilvusClient(**self.get_connection_kwargs())
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass
|
|
67
|
+
class MilvusUploadStagerConfig(UploadStagerConfig):
|
|
68
|
+
pass
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@dataclass
|
|
72
|
+
class MilvusUploadStager(UploadStager):
|
|
73
|
+
upload_stager_config: MilvusUploadStagerConfig = field(
|
|
74
|
+
default_factory=lambda: MilvusUploadStagerConfig()
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
@staticmethod
|
|
78
|
+
def parse_date_string(date_string: str) -> float:
|
|
79
|
+
try:
|
|
80
|
+
timestamp = float(date_string)
|
|
81
|
+
return timestamp
|
|
82
|
+
except ValueError:
|
|
83
|
+
pass
|
|
84
|
+
return parser.parse(date_string).timestamp()
|
|
85
|
+
|
|
86
|
+
@classmethod
|
|
87
|
+
def conform_dict(cls, data: dict) -> None:
|
|
88
|
+
datetime_columns = [
|
|
89
|
+
"data_source_date_created",
|
|
90
|
+
"data_source_date_modified",
|
|
91
|
+
"data_source_date_processed",
|
|
92
|
+
"last_modified",
|
|
93
|
+
]
|
|
94
|
+
|
|
95
|
+
json_dumps_fields = ["languages", "data_source_permissions_data"]
|
|
96
|
+
|
|
97
|
+
# TODO: milvus sdk doesn't seem to support defaults via the schema yet,
|
|
98
|
+
# remove once that gets updated
|
|
99
|
+
defaults = {"is_continuation": False}
|
|
100
|
+
|
|
101
|
+
if metadata := data.pop("metadata", None):
|
|
102
|
+
data.update(flatten_dict(metadata, keys_to_omit=["data_source_record_locator"]))
|
|
103
|
+
for datetime_column in datetime_columns:
|
|
104
|
+
if datetime_column in data:
|
|
105
|
+
data[datetime_column] = cls.parse_date_string(data[datetime_column])
|
|
106
|
+
for json_dumps_field in json_dumps_fields:
|
|
107
|
+
if json_dumps_field in data:
|
|
108
|
+
data[json_dumps_field] = json.dumps(data[json_dumps_field])
|
|
109
|
+
for default in defaults:
|
|
110
|
+
if default not in data:
|
|
111
|
+
data[default] = defaults[default]
|
|
112
|
+
|
|
113
|
+
def run(
|
|
114
|
+
self,
|
|
115
|
+
elements_filepath: Path,
|
|
116
|
+
file_data: FileData,
|
|
117
|
+
output_dir: Path,
|
|
118
|
+
output_filename: str,
|
|
119
|
+
**kwargs: Any,
|
|
120
|
+
) -> Path:
|
|
121
|
+
with open(elements_filepath) as elements_file:
|
|
122
|
+
elements_contents: list[dict[str, Any]] = json.load(elements_file)
|
|
123
|
+
for element in elements_contents:
|
|
124
|
+
self.conform_dict(data=element)
|
|
125
|
+
|
|
126
|
+
output_path = Path(output_dir) / Path(f"{output_filename}.json")
|
|
127
|
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
128
|
+
with output_path.open("w") as output_file:
|
|
129
|
+
json.dump(elements_contents, output_file, indent=2)
|
|
130
|
+
return output_path
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
@dataclass
|
|
134
|
+
class MilvusUploaderConfig(UploaderConfig):
|
|
135
|
+
collection_name: str
|
|
136
|
+
num_of_processes: int = 4
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
@dataclass
|
|
140
|
+
class MilvusUploader(Uploader):
|
|
141
|
+
connection_config: MilvusConnectionConfig
|
|
142
|
+
upload_config: MilvusUploaderConfig
|
|
143
|
+
connector_type: str = CONNECTOR_TYPE
|
|
144
|
+
|
|
145
|
+
def upload(self, content: UploadContent) -> None:
|
|
146
|
+
file_extension = content.path.suffix
|
|
147
|
+
if file_extension == ".json":
|
|
148
|
+
self.upload_json(content=content)
|
|
149
|
+
elif file_extension == ".csv":
|
|
150
|
+
self.upload_csv(content=content)
|
|
151
|
+
else:
|
|
152
|
+
raise ValueError(f"Unsupported file extension: {file_extension}")
|
|
153
|
+
|
|
154
|
+
@requires_dependencies(["pymilvus"], extras="milvus")
|
|
155
|
+
def insert_results(self, data: Union[dict, list[dict]]):
|
|
156
|
+
from pymilvus import MilvusException
|
|
157
|
+
|
|
158
|
+
logger.debug(
|
|
159
|
+
f"uploading {len(data)} entries to {self.connection_config.db_name} "
|
|
160
|
+
f"db in collection {self.upload_config.collection_name}"
|
|
161
|
+
)
|
|
162
|
+
client = self.connection_config.get_client()
|
|
163
|
+
|
|
164
|
+
try:
|
|
165
|
+
res = client.insert(collection_name=self.upload_config.collection_name, data=data)
|
|
166
|
+
except MilvusException as milvus_exception:
|
|
167
|
+
raise WriteError("failed to upload records to milvus") from milvus_exception
|
|
168
|
+
if "err_count" in res and isinstance(res["err_count"], int) and res["err_count"] > 0:
|
|
169
|
+
err_count = res["err_count"]
|
|
170
|
+
raise WriteError(f"failed to upload {err_count} docs")
|
|
171
|
+
|
|
172
|
+
def upload_csv(self, content: UploadContent) -> None:
|
|
173
|
+
df = pd.read_csv(content.path)
|
|
174
|
+
data = df.to_dict(orient="records")
|
|
175
|
+
self.insert_results(data=data)
|
|
176
|
+
|
|
177
|
+
def upload_json(self, content: UploadContent) -> None:
|
|
178
|
+
with content.path.open("r") as file:
|
|
179
|
+
data: list[dict] = json.load(file)
|
|
180
|
+
self.insert_results(data=data)
|
|
181
|
+
|
|
182
|
+
def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
|
|
183
|
+
if self.upload_config.num_of_processes == 1:
|
|
184
|
+
for content in contents:
|
|
185
|
+
self.upload(content=content)
|
|
186
|
+
|
|
187
|
+
else:
|
|
188
|
+
with mp.Pool(
|
|
189
|
+
processes=self.upload_config.num_of_processes,
|
|
190
|
+
) as pool:
|
|
191
|
+
pool.map(self.upload, contents)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
milvus_destination_entry = DestinationRegistryEntry(
|
|
195
|
+
connection_config=MilvusConnectionConfig,
|
|
196
|
+
uploader=MilvusUploader,
|
|
197
|
+
uploader_config=MilvusUploaderConfig,
|
|
198
|
+
upload_stager=MilvusUploadStager,
|
|
199
|
+
upload_stager_config=MilvusUploadStagerConfig,
|
|
200
|
+
)
|
|
@@ -5,10 +5,6 @@ from dataclasses import dataclass, field
|
|
|
5
5
|
from pathlib import Path
|
|
6
6
|
from typing import TYPE_CHECKING, Any, Optional
|
|
7
7
|
|
|
8
|
-
from unstructured.ingest.v2.logger import logger
|
|
9
|
-
from unstructured.ingest.v2.processes.connector_registry import (
|
|
10
|
-
DestinationRegistryEntry,
|
|
11
|
-
)
|
|
12
8
|
from unstructured.staging.base import flatten_dict
|
|
13
9
|
from unstructured.utils import requires_dependencies
|
|
14
10
|
|
|
@@ -24,6 +20,10 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
24
20
|
UploadStager,
|
|
25
21
|
UploadStagerConfig,
|
|
26
22
|
)
|
|
23
|
+
from unstructured_ingest.v2.logger import logger
|
|
24
|
+
from unstructured_ingest.v2.processes.connector_registry import (
|
|
25
|
+
DestinationRegistryEntry,
|
|
26
|
+
)
|
|
27
27
|
|
|
28
28
|
if TYPE_CHECKING:
|
|
29
29
|
from pinecone import Index as PineconeIndex
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: unstructured-ingest
|
|
3
|
-
Version: 0.0.0
|
|
3
|
+
Version: 0.0.1
|
|
4
4
|
Summary: A library that prepares raw documents for downstream ML tasks.
|
|
5
5
|
Home-page: https://github.com/Unstructured-IO/unstructured-ingest
|
|
6
6
|
Author: Unstructured Technologies
|
|
@@ -29,22 +29,22 @@ Requires-Dist: pyairtable ; extra == 'airtable'
|
|
|
29
29
|
Provides-Extra: astra
|
|
30
30
|
Requires-Dist: astrapy ; extra == 'astra'
|
|
31
31
|
Provides-Extra: azure
|
|
32
|
-
Requires-Dist: fsspec ; extra == 'azure'
|
|
33
32
|
Requires-Dist: adlfs ; extra == 'azure'
|
|
33
|
+
Requires-Dist: fsspec ; extra == 'azure'
|
|
34
34
|
Provides-Extra: azure-cognitive-search
|
|
35
35
|
Requires-Dist: azure-search-documents ; extra == 'azure-cognitive-search'
|
|
36
36
|
Provides-Extra: bedrock
|
|
37
|
-
Requires-Dist: boto3 ; extra == 'bedrock'
|
|
38
37
|
Requires-Dist: langchain-community ; extra == 'bedrock'
|
|
38
|
+
Requires-Dist: boto3 ; extra == 'bedrock'
|
|
39
39
|
Provides-Extra: biomed
|
|
40
40
|
Requires-Dist: bs4 ; extra == 'biomed'
|
|
41
41
|
Provides-Extra: box
|
|
42
|
-
Requires-Dist: boxfs ; extra == 'box'
|
|
43
42
|
Requires-Dist: fsspec ; extra == 'box'
|
|
43
|
+
Requires-Dist: boxfs ; extra == 'box'
|
|
44
44
|
Provides-Extra: chroma
|
|
45
|
-
Requires-Dist: importlib-metadata >=7.1.0 ; extra == 'chroma'
|
|
46
45
|
Requires-Dist: typer <=0.9.0 ; extra == 'chroma'
|
|
47
46
|
Requires-Dist: chromadb ; extra == 'chroma'
|
|
47
|
+
Requires-Dist: importlib-metadata >=7.1.0 ; extra == 'chroma'
|
|
48
48
|
Provides-Extra: clarifai
|
|
49
49
|
Requires-Dist: clarifai ; extra == 'clarifai'
|
|
50
50
|
Provides-Extra: confluence
|
|
@@ -72,21 +72,21 @@ Requires-Dist: huggingface ; extra == 'embed-huggingface'
|
|
|
72
72
|
Requires-Dist: langchain-community ; extra == 'embed-huggingface'
|
|
73
73
|
Requires-Dist: sentence-transformers ; extra == 'embed-huggingface'
|
|
74
74
|
Provides-Extra: embed-octoai
|
|
75
|
-
Requires-Dist: openai ; extra == 'embed-octoai'
|
|
76
75
|
Requires-Dist: tiktoken ; extra == 'embed-octoai'
|
|
76
|
+
Requires-Dist: openai ; extra == 'embed-octoai'
|
|
77
77
|
Provides-Extra: embed-vertexai
|
|
78
|
-
Requires-Dist: langchain-google-vertexai ; extra == 'embed-vertexai'
|
|
79
78
|
Requires-Dist: langchain-community ; extra == 'embed-vertexai'
|
|
80
79
|
Requires-Dist: langchain ; extra == 'embed-vertexai'
|
|
80
|
+
Requires-Dist: langchain-google-vertexai ; extra == 'embed-vertexai'
|
|
81
81
|
Provides-Extra: embed-voyageai
|
|
82
82
|
Requires-Dist: langchain-voyageai ; extra == 'embed-voyageai'
|
|
83
83
|
Requires-Dist: langchain ; extra == 'embed-voyageai'
|
|
84
84
|
Provides-Extra: epub
|
|
85
85
|
Requires-Dist: unstructured[epub] ; extra == 'epub'
|
|
86
86
|
Provides-Extra: gcs
|
|
87
|
-
Requires-Dist: fsspec ; extra == 'gcs'
|
|
88
87
|
Requires-Dist: bs4 ; extra == 'gcs'
|
|
89
88
|
Requires-Dist: gcsfs ; extra == 'gcs'
|
|
89
|
+
Requires-Dist: fsspec ; extra == 'gcs'
|
|
90
90
|
Provides-Extra: github
|
|
91
91
|
Requires-Dist: pygithub >1.58.0 ; extra == 'github'
|
|
92
92
|
Provides-Extra: gitlab
|
|
@@ -94,38 +94,40 @@ Requires-Dist: python-gitlab ; extra == 'gitlab'
|
|
|
94
94
|
Provides-Extra: google-drive
|
|
95
95
|
Requires-Dist: google-api-python-client ; extra == 'google-drive'
|
|
96
96
|
Provides-Extra: hubspot
|
|
97
|
-
Requires-Dist: urllib3 ; extra == 'hubspot'
|
|
98
97
|
Requires-Dist: hubspot-api-client ; extra == 'hubspot'
|
|
98
|
+
Requires-Dist: urllib3 ; extra == 'hubspot'
|
|
99
99
|
Provides-Extra: jira
|
|
100
100
|
Requires-Dist: atlassian-python-api ; extra == 'jira'
|
|
101
101
|
Provides-Extra: kafka
|
|
102
102
|
Requires-Dist: confluent-kafka ; extra == 'kafka'
|
|
103
103
|
Provides-Extra: md
|
|
104
104
|
Requires-Dist: unstructured[md] ; extra == 'md'
|
|
105
|
+
Provides-Extra: milvus
|
|
106
|
+
Requires-Dist: pymilvus ; extra == 'milvus'
|
|
105
107
|
Provides-Extra: mongodb
|
|
106
108
|
Requires-Dist: pymongo ; extra == 'mongodb'
|
|
107
109
|
Provides-Extra: msg
|
|
108
110
|
Requires-Dist: unstructured[msg] ; extra == 'msg'
|
|
109
111
|
Provides-Extra: notion
|
|
110
|
-
Requires-Dist: notion-client ; extra == 'notion'
|
|
111
112
|
Requires-Dist: htmlBuilder ; extra == 'notion'
|
|
113
|
+
Requires-Dist: notion-client ; extra == 'notion'
|
|
112
114
|
Provides-Extra: odt
|
|
113
115
|
Requires-Dist: unstructured[odt] ; extra == 'odt'
|
|
114
116
|
Provides-Extra: onedrive
|
|
115
|
-
Requires-Dist: msal ; extra == 'onedrive'
|
|
116
117
|
Requires-Dist: bs4 ; extra == 'onedrive'
|
|
117
118
|
Requires-Dist: Office365-REST-Python-Client ; extra == 'onedrive'
|
|
119
|
+
Requires-Dist: msal ; extra == 'onedrive'
|
|
118
120
|
Provides-Extra: openai
|
|
119
|
-
Requires-Dist: openai ; extra == 'openai'
|
|
120
121
|
Requires-Dist: tiktoken ; extra == 'openai'
|
|
121
122
|
Requires-Dist: langchain-community ; extra == 'openai'
|
|
123
|
+
Requires-Dist: openai ; extra == 'openai'
|
|
122
124
|
Provides-Extra: opensearch
|
|
123
125
|
Requires-Dist: opensearch-py ; extra == 'opensearch'
|
|
124
126
|
Provides-Extra: org
|
|
125
127
|
Requires-Dist: unstructured[org] ; extra == 'org'
|
|
126
128
|
Provides-Extra: outlook
|
|
127
|
-
Requires-Dist: msal ; extra == 'outlook'
|
|
128
129
|
Requires-Dist: Office365-REST-Python-Client ; extra == 'outlook'
|
|
130
|
+
Requires-Dist: msal ; extra == 'outlook'
|
|
129
131
|
Provides-Extra: pdf
|
|
130
132
|
Requires-Dist: unstructured[pdf] ; extra == 'pdf'
|
|
131
133
|
Provides-Extra: pinecone
|
|
@@ -150,11 +152,11 @@ Requires-Dist: s3fs ; extra == 's3'
|
|
|
150
152
|
Provides-Extra: salesforce
|
|
151
153
|
Requires-Dist: simple-salesforce ; extra == 'salesforce'
|
|
152
154
|
Provides-Extra: sftp
|
|
153
|
-
Requires-Dist: fsspec ; extra == 'sftp'
|
|
154
155
|
Requires-Dist: paramiko ; extra == 'sftp'
|
|
156
|
+
Requires-Dist: fsspec ; extra == 'sftp'
|
|
155
157
|
Provides-Extra: sharepoint
|
|
156
|
-
Requires-Dist: msal ; extra == 'sharepoint'
|
|
157
158
|
Requires-Dist: Office365-REST-Python-Client ; extra == 'sharepoint'
|
|
159
|
+
Requires-Dist: msal ; extra == 'sharepoint'
|
|
158
160
|
Provides-Extra: singlestore
|
|
159
161
|
Requires-Dist: singlestoredb ; extra == 'singlestore'
|
|
160
162
|
Provides-Extra: slack
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
2
|
-
unstructured_ingest/__version__.py,sha256=
|
|
2
|
+
unstructured_ingest/__version__.py,sha256=SI019rW6paHw93e6fOWFzF9TruLom8o9HrgZsjGZvaE,42
|
|
3
3
|
unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
|
|
4
4
|
unstructured_ingest/evaluate.py,sha256=R-mKLFXbVX1xQ1tjGsLHjdP-TbSSV-925IHzggW_bIg,9793
|
|
5
5
|
unstructured_ingest/interfaces.py,sha256=uS8L5mS0mXD8I4XTfVlKZxAwqnpJ4yrRqn4vxWVRhQI,31107
|
|
6
|
-
unstructured_ingest/logger.py,sha256=
|
|
6
|
+
unstructured_ingest/logger.py,sha256=TrhyH7VbCWO5VVuhvL0yUyXxuem3b4pzbqj2uQHUwZk,4480
|
|
7
7
|
unstructured_ingest/main.py,sha256=82G_7eG4PNhc_xIqj4Y_sFbDV9VI-nwSfsfJQMzovMk,169
|
|
8
8
|
unstructured_ingest/processor.py,sha256=XKKrvbxsb--5cDzz4hB3-GfWZYyIjJ2ah8FpzQKF_DM,2760
|
|
9
9
|
unstructured_ingest/cli/__init__.py,sha256=9kNcBOHuXON5lB1MJU9QewEhwPmId56vXqB29-kqEAA,302
|
|
@@ -104,7 +104,7 @@ unstructured_ingest/connector/fsspec/sftp.py,sha256=x2w8JGM81S_HXww7Aa-bTY1LjZSi
|
|
|
104
104
|
unstructured_ingest/connector/notion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
105
105
|
unstructured_ingest/connector/notion/client.py,sha256=vU1GE64ktEAM4b-jo8UnMAwz60KSiQ6iRI3De3ixNdI,8689
|
|
106
106
|
unstructured_ingest/connector/notion/connector.py,sha256=E-t7q5XAiYP9xk-1aqIqcGwdJOH8UNgiE0HcH9Oc4i4,17475
|
|
107
|
-
unstructured_ingest/connector/notion/helpers.py,sha256=
|
|
107
|
+
unstructured_ingest/connector/notion/helpers.py,sha256=jqg5-cPcrjm5G7dBF4jOjHxdJN1CI7yBeTefiX4hBoM,20702
|
|
108
108
|
unstructured_ingest/connector/notion/interfaces.py,sha256=SrTT-9c0nvk0fMqVgudYF647r04AdMKi6wkIkMy7Szw,563
|
|
109
109
|
unstructured_ingest/connector/notion/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
110
110
|
unstructured_ingest/connector/notion/types/block.py,sha256=AKOY-o6CTFC-caWlkLfKskMuFemH4-Vdrhv7HnRkS8w,3009
|
|
@@ -253,7 +253,7 @@ unstructured_ingest/utils/string_and_date_utils.py,sha256=hnGglD8Z626vLhH_UV4Qyb
|
|
|
253
253
|
unstructured_ingest/utils/table.py,sha256=aWjcowDVSClNpEAdR6PY3H7khKu4T6T3QqQE6GjmQ_M,3469
|
|
254
254
|
unstructured_ingest/v2/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
255
255
|
unstructured_ingest/v2/example.py,sha256=qkwmpMxUlaJXdDNKQ4LlUt3XGxgTUU3CXGGO57eW5Gs,1644
|
|
256
|
-
unstructured_ingest/v2/logger.py,sha256=
|
|
256
|
+
unstructured_ingest/v2/logger.py,sha256=akcghdHwpKM3CfoeFzir0zmc7R9Hk7zjquU-X-gwUIw,4324
|
|
257
257
|
unstructured_ingest/v2/main.py,sha256=WFdLEqEXRy6E9_G-dF20MK2AtgX51Aan1sp_N67U2B8,172
|
|
258
258
|
unstructured_ingest/v2/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
259
259
|
unstructured_ingest/v2/cli/cli.py,sha256=qHXIs-PcvMgDZhP1AR9iDMxh8FXBMJCEDksPBfiMULE,648
|
|
@@ -264,7 +264,7 @@ unstructured_ingest/v2/cli/base/cmd.py,sha256=qVHmquVsVDoYyPByKdUTVCwAFfILMYBw5w
|
|
|
264
264
|
unstructured_ingest/v2/cli/base/dest.py,sha256=YMbVIHmYDqvOtxZeEY93stmF2p2ImjuJts7-u-NznYw,2887
|
|
265
265
|
unstructured_ingest/v2/cli/base/importer.py,sha256=nRt0QQ3qpi264-n_mR0l55C2ddM8nowTNzT1jsWaam8,1128
|
|
266
266
|
unstructured_ingest/v2/cli/base/src.py,sha256=7LnZh9FgUX9rerBH6cizVtTWmM6R2sRkxatnGsxYHG0,2410
|
|
267
|
-
unstructured_ingest/v2/cli/cmds/__init__.py,sha256=
|
|
267
|
+
unstructured_ingest/v2/cli/cmds/__init__.py,sha256=DWPMD6Wqus22sSoIEyTSiOJAm97aNjvdpdrXgsL4uQ0,2647
|
|
268
268
|
unstructured_ingest/v2/cli/cmds/astra.py,sha256=L-GR2KSP_cFQkQm0aVcdiXmgYMJZCVKIAH794y8qT1M,2590
|
|
269
269
|
unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py,sha256=VTCSUYeIYKnP60lC7DeBYqoqAJnWuBZrwevCXbeIEzw,2248
|
|
270
270
|
unstructured_ingest/v2/cli/cmds/chroma.py,sha256=RinNOPripk2zRYx1Rt-u-jywXbwh7JsidVia4F0-wyU,3359
|
|
@@ -272,6 +272,7 @@ unstructured_ingest/v2/cli/cmds/databricks_volumes.py,sha256=53d9A7UunJLYZFwwwHE
|
|
|
272
272
|
unstructured_ingest/v2/cli/cmds/elasticsearch.py,sha256=joUfnV992fAwEDCtFVJaABwgpyQiWeDl1ZCBEudRtnk,5258
|
|
273
273
|
unstructured_ingest/v2/cli/cmds/google_drive.py,sha256=mXozabpi8kjRFb0S7kw-xMGtEuFoVUxnvefwL5ZIPHc,2334
|
|
274
274
|
unstructured_ingest/v2/cli/cmds/local.py,sha256=lGBFOVDRlrcCtPFjyk0IAYHLRWg95Kunu1Kli7t0ZK4,1899
|
|
275
|
+
unstructured_ingest/v2/cli/cmds/milvus.py,sha256=PB1ib1rFGGH_-KDi1bSIO3BIiVcqSJEHCBFFrzQrnmI,1998
|
|
275
276
|
unstructured_ingest/v2/cli/cmds/mongodb.py,sha256=oyV6tacuuxm3dN-AXQgbxvYJiDYo2OOWQKRSBCUGj0E,1823
|
|
276
277
|
unstructured_ingest/v2/cli/cmds/onedrive.py,sha256=DKqhQyyF-swZxs3C9G5W8ECleq8sWpDbpTuiAHXukXQ,2781
|
|
277
278
|
unstructured_ingest/v2/cli/cmds/opensearch.py,sha256=7zl8dUXzxs24MDRRASKfNc14IDM798qOXRl2FZdXG1I,3064
|
|
@@ -304,7 +305,7 @@ unstructured_ingest/v2/interfaces/processor.py,sha256=uHVHeKo5Gt_zFkaEXw7xgaCBDT
|
|
|
304
305
|
unstructured_ingest/v2/interfaces/upload_stager.py,sha256=SylhDl9pK6qa7hvfrhpabCkjwE03yIlI6oM-mQnqtho,1220
|
|
305
306
|
unstructured_ingest/v2/interfaces/uploader.py,sha256=bzfx3Ei4poXKu-hsgjAB4sj4jKij9CoaRSadUM5LtGk,1083
|
|
306
307
|
unstructured_ingest/v2/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
307
|
-
unstructured_ingest/v2/pipeline/interfaces.py,sha256=
|
|
308
|
+
unstructured_ingest/v2/pipeline/interfaces.py,sha256=Zz76fLHNKw6BDsBSYQXiRa6CvyW91ulvZU0yw5vVQ5M,6544
|
|
308
309
|
unstructured_ingest/v2/pipeline/pipeline.py,sha256=r8jRMZI2RF8GQIuTcjIFBDeFtMnqpOJmKhEriy6Vo5Y,11616
|
|
309
310
|
unstructured_ingest/v2/pipeline/utils.py,sha256=oPAitfdnITqh2O8Z0uf6VOHg9BTJhitRzNmKXqTwPxg,422
|
|
310
311
|
unstructured_ingest/v2/pipeline/steps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -322,7 +323,7 @@ unstructured_ingest/v2/processes/connector_registry.py,sha256=KOrvJNNRdpBPyqFwmT
|
|
|
322
323
|
unstructured_ingest/v2/processes/embedder.py,sha256=QjAsiXAjWtZzh6lJ4D5LsTMBD81zuMBkegXNWq-FZt0,3308
|
|
323
324
|
unstructured_ingest/v2/processes/partitioner.py,sha256=f6UQoQHVKjl8rmM5J9EcuP30RTFLSLrArGdC6qh-ffE,7645
|
|
324
325
|
unstructured_ingest/v2/processes/uncompress.py,sha256=x-JZYNs1zJOtRS7xNgiMyrYoAbzKM0p18O8NAl7avCA,1631
|
|
325
|
-
unstructured_ingest/v2/processes/connectors/__init__.py,sha256=
|
|
326
|
+
unstructured_ingest/v2/processes/connectors/__init__.py,sha256=7QMKd8gtEJTIuK352Ho6XyoFvLLhrWIzgdu0dXwXWOE,3960
|
|
326
327
|
unstructured_ingest/v2/processes/connectors/astra.py,sha256=TSI_3GHnEh3gYAC30RTG4b2eEB07agroEFmJ38GnQY4,4903
|
|
327
328
|
unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py,sha256=PT02ZKiJuHMrmBClxqBsyDS0aXUQYLVg02Ns2qh1hD4,7935
|
|
328
329
|
unstructured_ingest/v2/processes/connectors/chroma.py,sha256=nYzNz-8oq-DN0Z4r7lHQFmlved76IaYeRvm7-EmbGUE,6998
|
|
@@ -330,10 +331,11 @@ unstructured_ingest/v2/processes/connectors/databricks_volumes.py,sha256=MTLK7Sv
|
|
|
330
331
|
unstructured_ingest/v2/processes/connectors/elasticsearch.py,sha256=6QBvVzPk3mWj9ZqJZN7NvhcJaOO6nSLqLwU6zggP59A,14864
|
|
331
332
|
unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=IkLVafUu280OOoqYmdfdfMB6zlpiWjs2Z5J31ZzJOj4,12681
|
|
332
333
|
unstructured_ingest/v2/processes/connectors/local.py,sha256=maAXVKpRRXj_jseC6EPLTosMgw6ll-0lnGsDdAFLWAE,6646
|
|
334
|
+
unstructured_ingest/v2/processes/connectors/milvus.py,sha256=FWH4FH-zns7gh8sITg9pLYE9uKm_3GeOXJ4wjY6PMno,6776
|
|
333
335
|
unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=ErZWAD-su3OCRGv1h84X1PpAWleUPVZcFDEIYjtyP4E,4310
|
|
334
336
|
unstructured_ingest/v2/processes/connectors/onedrive.py,sha256=WDDoFEfd8M_QBTpkGNI2zZGZZ_CR1rQiCsBWYOO2JoA,8311
|
|
335
337
|
unstructured_ingest/v2/processes/connectors/opensearch.py,sha256=HNRZVQsWnjLLm0yAGiIyHRbhAsBnGSXBO_VkUfIdwdE,5463
|
|
336
|
-
unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=
|
|
338
|
+
unstructured_ingest/v2/processes/connectors/pinecone.py,sha256=PtAodxemYgiBZESx-g9a8fcL6XagJd9DIDQjrhE8aPk,5746
|
|
337
339
|
unstructured_ingest/v2/processes/connectors/salesforce.py,sha256=Cz4qEtnbsD9-m1DXANxnVRZTHX2ZaUUBPVFPu5wnFRk,10832
|
|
338
340
|
unstructured_ingest/v2/processes/connectors/sharepoint.py,sha256=SNovgGUE5tHdfX_lF5zwM_QRZK7mahHzLZKhnqfk6Tc,17696
|
|
339
341
|
unstructured_ingest/v2/processes/connectors/singlestore.py,sha256=upF2O4hJ2uiBhDRrpQ8CSJUvzmqu2j5H1b_QbReHJpw,5168
|
|
@@ -349,8 +351,8 @@ unstructured_ingest/v2/processes/connectors/fsspec/gcs.py,sha256=RYZq_8hKF7bRxuB
|
|
|
349
351
|
unstructured_ingest/v2/processes/connectors/fsspec/s3.py,sha256=7lOm5hjb0LBkbe-OWXnV3wDC-3mM_GWwwmdKW0xzh8c,5333
|
|
350
352
|
unstructured_ingest/v2/processes/connectors/fsspec/sftp.py,sha256=J7Ej-j7dtXAluHunwynUfHlNsYwymb-LsrGUFcljcsA,5700
|
|
351
353
|
unstructured_ingest/v2/processes/connectors/fsspec/utils.py,sha256=jec_Qfe2hbfahBuY-u8FnvHuv933AI5HwPFjOL3kEEY,456
|
|
352
|
-
unstructured_ingest-0.0.
|
|
353
|
-
unstructured_ingest-0.0.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
354
|
-
unstructured_ingest-0.0.0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
355
|
-
unstructured_ingest-0.0.0.dist-info/top_level.txt,sha256=QaTxTcjfM5Hr9sZJ6weOJvSe5ESQc0F8AWkhHInTCf8,20
|
|
356
|
-
unstructured_ingest-0.0.0.dist-info/RECORD,,
|
|
354
|
+
unstructured_ingest-0.0.1.dist-info/METADATA,sha256=Qru27Cxrf0C-vFe7MqfaKOfavazrWYTTRif6loKf71o,21568
|
|
355
|
+
unstructured_ingest-0.0.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
356
|
+
unstructured_ingest-0.0.1.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
|
|
357
|
+
unstructured_ingest-0.0.1.dist-info/top_level.txt,sha256=QaTxTcjfM5Hr9sZJ6weOJvSe5ESQc0F8AWkhHInTCf8,20
|
|
358
|
+
unstructured_ingest-0.0.1.dist-info/RECORD,,
|
|
File without changes
|
{unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.1.dist-info}/entry_points.txt
RENAMED
|
File without changes
|
|
File without changes
|