unstructured-ingest 0.5.20__py3-none-any.whl → 0.5.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest has been flagged as possibly problematic.
- test/integration/connectors/test_astradb.py +8 -2
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/interfaces.py +7 -3
- unstructured_ingest/utils/data_prep.py +17 -5
- unstructured_ingest/utils/table.py +11 -4
- unstructured_ingest/v2/processes/connectors/delta_table.py +8 -3
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +4 -3
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +5 -2
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +5 -2
- unstructured_ingest/v2/processes/connectors/kdbai.py +6 -3
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +10 -2
- unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +5 -3
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +5 -1
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +7 -3
- unstructured_ingest/v2/processes/connectors/sql/sql.py +22 -9
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +5 -1
- unstructured_ingest/v2/processes/connectors/sql/vastdb.py +5 -7
- {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.21.dist-info}/METADATA +175 -24
- {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.21.dist-info}/RECORD +23 -23
- {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.21.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.21.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.21.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.21.dist-info}/top_level.txt +0 -0
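The dominant change across the connector and utility modules in this release is that module-level `import pandas as pd` / `import numpy as np` statements are replaced with function-local imports guarded by the `requires_dependencies` decorator from `unstructured_ingest.utils.dep_check` (whose own RECORD hash is unchanged), so importing a connector no longer requires pandas or numpy to be installed. Below is a minimal, self-contained sketch of that pattern; the decorator body is a simplified stand-in, not the library's actual implementation, and `run_data` plus the error message are illustrative only.

from functools import wraps
from importlib import import_module
from typing import Any, Callable, Optional


def requires_dependencies(deps: list[str], extras: Optional[str] = None) -> Callable:
    # Simplified stand-in for unstructured_ingest.utils.dep_check.requires_dependencies:
    # check optional imports when the function is called, not when the module is imported.
    def decorator(func: Callable) -> Callable:
        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            for dep in deps:
                try:
                    import_module(dep)
                except ImportError as e:
                    hint = f' -- try `pip install "unstructured-ingest[{extras}]"`' if extras else ""
                    raise ImportError(f"{func.__name__} requires the {dep} package{hint}") from e
            return func(*args, **kwargs)

        return wrapper

    return decorator


@requires_dependencies(["pandas"], extras="duckdb")
def run_data(data: list[dict]) -> None:
    # pandas is imported lazily inside the function body, mirroring the
    # connector changes below; importing this module alone needs no pandas.
    import pandas as pd

    df = pd.DataFrame(data=data)
    print(f"would upload {len(df)} rows")


if __name__ == "__main__":
    run_data([{"text": "hello"}, {"text": "world"}])

The per-file hunks follow.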
test/integration/connectors/test_astradb.py
@@ -1,3 +1,4 @@
+import contextlib
 import json
 import os
 from dataclasses import dataclass
@@ -231,6 +232,13 @@ def test_astra_create_destination():
     )
     collection_name = "system_created-123"
     formatted_collection_name = "system_created_123"
+
+    client = AstraDBClient()
+    db = client.get_database(api_endpoint=env_data.api_endpoint, token=env_data.token)
+    with contextlib.suppress(Exception):
+        # drop collection before trying to create it
+        db.drop_collection(formatted_collection_name)
+
     created = uploader.create_destination(destination_name=collection_name, vector_length=3072)
     assert created
     assert uploader.upload_config.collection_name == formatted_collection_name
@@ -239,8 +247,6 @@ def test_astra_create_destination():
     assert not created

     # cleanup
-    client = AstraDBClient()
-    db = client.get_database(api_endpoint=env_data.api_endpoint, token=env_data.token)
     db.drop_collection(formatted_collection_name)

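The test change above makes test_astra_create_destination idempotent: any collection left over from an earlier run is dropped before the create call, with errors suppressed. A minimal sketch of that cleanup idiom (the `db` handle and collection name are placeholders, not the test's fixtures):

import contextlib


def reset_collection(db, collection_name: str) -> None:
    # Dropping a collection that does not exist raises; suppress the error so
    # the pre-test cleanup can never fail the test itself.
    with contextlib.suppress(Exception):
        db.drop_collection(collection_name)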
unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.20"
+__version__ = "0.5.21"  # pragma: no cover
unstructured_ingest/embed/interfaces.py
@@ -2,10 +2,10 @@ from abc import ABC
 from dataclasses import dataclass
 from typing import Any, Optional

-import numpy as np
 from pydantic import BaseModel, Field

 from unstructured_ingest.utils.data_prep import batch_generator
+from unstructured_ingest.utils.dep_check import requires_dependencies

 EMBEDDINGS_KEY = "embeddings"

@@ -32,7 +32,6 @@ class BaseEncoder(ABC):

 @dataclass
 class BaseEmbeddingEncoder(BaseEncoder, ABC):
-
     def initialize(self):
         """Initializes the embedding encoder class. Should also validate the instance
         is properly configured: e.g., embed a single a element"""
@@ -46,8 +45,11 @@ class BaseEmbeddingEncoder(BaseEncoder, ABC):
         return self.embed_query(query="Q")

     @property
+    @requires_dependencies(["numpy"])
     def is_unit_vector(self) -> bool:
         """Denotes if the embedding vector is a unit vector."""
+        import numpy as np
+
         exemplary_embedding = self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0, rtol=1e-03)

@@ -86,7 +88,6 @@ class BaseEmbeddingEncoder(BaseEncoder, ABC):

 @dataclass
 class AsyncBaseEmbeddingEncoder(BaseEncoder, ABC):
-
     async def initialize(self):
         """Initializes the embedding encoder class. Should also validate the instance
         is properly configured: e.g., embed a single a element"""
@@ -100,8 +101,11 @@ class AsyncBaseEmbeddingEncoder(BaseEncoder, ABC):
         return await self.embed_query(query="Q")

     @property
+    @requires_dependencies(["numpy"])
     async def is_unit_vector(self) -> bool:
         """Denotes if the embedding vector is a unit vector."""
+        import numpy as np
+
         exemplary_embedding = await self.get_exemplary_embedding()
         return np.isclose(np.linalg.norm(exemplary_embedding), 1.0, rtol=1e-03)

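Both is_unit_vector properties keep the same check and only defer the numpy import: the L2 norm of an exemplary embedding should be approximately 1.0 for a unit-normalized embedding model. A standalone illustration of that check:

import numpy as np


def is_unit_vector(embedding: list[float]) -> bool:
    # Same comparison the encoders use: norm of the embedding close to 1.0.
    return bool(np.isclose(np.linalg.norm(embedding), 1.0, rtol=1e-03))


print(is_unit_vector([0.6, 0.8]))  # True: sqrt(0.36 + 0.64) == 1.0
print(is_unit_vector([1.0, 1.0]))  # False: norm is about 1.414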
unstructured_ingest/utils/data_prep.py
@@ -2,20 +2,22 @@ import itertools
 import json
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, Union, cast
-
-import pandas as pd
+from typing import TYPE_CHECKING, Any, Generator, Iterable, Optional, Sequence, TypeVar, Union, cast

 from unstructured_ingest.utils import ndjson
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.logger import logger

+if TYPE_CHECKING:
+    from pandas import DataFrame
+
 DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")

 T = TypeVar("T")
 IterableT = Iterable[T]


-def split_dataframe(df:
+def split_dataframe(df: "DataFrame", chunk_size: int = 100) -> Generator["DataFrame", None, None]:
     num_chunks = len(df) // chunk_size + 1
     for i in range(num_chunks):
         yield df[i * chunk_size : (i + 1) * chunk_size]
@@ -144,9 +146,13 @@ def get_data_by_suffix(path: Path) -> list[dict]:
         elif path.suffix == ".ndjson":
             return ndjson.load(f)
         elif path.suffix == ".csv":
+            import pandas as pd
+
             df = pd.read_csv(path)
             return df.to_dict(orient="records")
         elif path.suffix == ".parquet":
+            import pandas as pd
+
             df = pd.read_parquet(path)
             return df.to_dict(orient="records")
         else:
@@ -180,6 +186,9 @@ def get_data(path: Union[Path, str]) -> list[dict]:
             return ndjson.load(f)
     except Exception as e:
         logger.warning(f"failed to read {path} as ndjson: {e}")
+
+    import pandas as pd
+
     try:
         df = pd.read_csv(path)
         return df.to_dict(orient="records")
@@ -202,7 +211,10 @@ def get_json_data(path: Path) -> list[dict]:
     raise ValueError(f"Unsupported file type: {path}")


-
+@requires_dependencies(["pandas"])
+def get_data_df(path: Path) -> "DataFrame":
+    import pandas as pd
+
     with path.open() as f:
         if path.suffix == ".json":
             data = json.load(f)
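data_prep.py (and most connectors below) now keep DataFrame only as a typing-time import: `if TYPE_CHECKING: from pandas import DataFrame` plus quoted annotations, so signatures stay typed while pandas is never imported at runtime just to load the module. A small self-contained sketch of the idiom, modeled on split_dataframe (renamed here so it is not mistaken for the library function):

from typing import TYPE_CHECKING, Generator

if TYPE_CHECKING:
    # Only evaluated by static type checkers; no pandas import at runtime.
    from pandas import DataFrame


def split_frame(df: "DataFrame", chunk_size: int = 100) -> Generator["DataFrame", None, None]:
    # Quoted annotations keep the signature informative while pandas stays optional.
    num_chunks = len(df) // chunk_size + 1
    for i in range(num_chunks):
        yield df[i * chunk_size : (i + 1) * chunk_size]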
unstructured_ingest/utils/table.py
@@ -1,11 +1,16 @@
-from typing import Any
-
-import pandas as pd
+from typing import TYPE_CHECKING, Any

 from unstructured_ingest.utils.data_prep import flatten_dict
+from unstructured_ingest.utils.dep_check import requires_dependencies
+
+if TYPE_CHECKING:
+    from pandas import DataFrame


+@requires_dependencies(["pandas"])
 def get_default_pandas_dtypes() -> dict[str, Any]:
+    import pandas as pd
+
     return {
         "text": pd.StringDtype(),  # type: ignore
         "type": pd.StringDtype(),  # type: ignore
@@ -57,7 +62,9 @@ def get_default_pandas_dtypes() -> dict[str, Any]:
 def convert_to_pandas_dataframe(
     elements_dict: list[dict[str, Any]],
     drop_empty_cols: bool = False,
-) ->
+) -> "DataFrame":
+    import pandas as pd
+
     # Flatten metadata if it hasn't already been flattened
     for d in elements_dict:
         if metadata := d.pop("metadata", None):
unstructured_ingest/v2/processes/connectors/delta_table.py
@@ -3,10 +3,9 @@ import traceback
 from dataclasses import dataclass, field
 from multiprocessing import Process, Queue
 from pathlib import Path
-from typing import Any, Optional
+from typing import TYPE_CHECKING, Any, Optional
 from urllib.parse import urlparse

-import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.error import DestinationConnectionError
@@ -27,6 +26,9 @@ from unstructured_ingest.v2.processes.connector_registry import DestinationRegis

 CONNECTOR_TYPE = "delta_table"

+if TYPE_CHECKING:
+    from pandas import DataFrame
+

 @requires_dependencies(["deltalake"], extras="delta-table")
 def write_deltalake_with_error_handling(queue, **kwargs):
@@ -136,7 +138,7 @@ class DeltaTableUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    def upload_dataframe(self, df:
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
         updated_upload_path = os.path.join(
             self.connection_config.table_uri, file_data.source_identifiers.relative_path
         )
@@ -172,7 +174,10 @@ class DeltaTableUploader(Uploader):
             logger.error(f"Exception occurred in write_deltalake: {error_message}")
             raise RuntimeError(f"Error in write_deltalake: {error_message}")

+    @requires_dependencies(["pandas"], extras="delta-table")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df, file_data=file_data)

unstructured_ingest/v2/processes/connectors/duckdb/base.py
@@ -2,9 +2,8 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Any

-import pandas as pd
-
 from unstructured_ingest.utils.data_prep import get_data, write_data
+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import FileData, UploadStager
 from unstructured_ingest.v2.utils import get_enhanced_element_id

@@ -55,7 +54,6 @@ _COLUMNS = (

 @dataclass
 class BaseDuckDBUploadStager(UploadStager):
-
     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         data = element_dict.copy()
         metadata: dict[str, Any] = data.pop("metadata", {})
@@ -72,6 +70,7 @@ class BaseDuckDBUploadStager(UploadStager):
         data = {k: v for k, v in data.items() if k in _COLUMNS}
         return data

+    @requires_dependencies(["pandas"], extras="duckdb")
     def run(
         self,
         elements_filepath: Path,
@@ -80,6 +79,8 @@ class BaseDuckDBUploadStager(UploadStager):
         output_filename: str,
         **kwargs: Any,
     ) -> Path:
+        import pandas as pd
+
         elements_contents = get_data(path=elements_filepath)
         output_filename_suffix = Path(elements_filepath).suffix
         output_filename = f"{Path(output_filename).stem}{output_filename_suffix}"
unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py
@@ -3,7 +3,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional

-import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.error import DestinationConnectionError
@@ -23,6 +22,7 @@ from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUp

 if TYPE_CHECKING:
     from duckdb import DuckDBPyConnection as DuckDBConnection
+    from pandas import DataFrame

 CONNECTOR_TYPE = "duckdb"

@@ -101,7 +101,7 @@ class DuckDBUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    def upload_dataframe(self, df:
+    def upload_dataframe(self, df: "DataFrame") -> None:
         logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")

         with self.connection_config.get_client() as conn:
@@ -109,7 +109,10 @@ class DuckDBUploader(Uploader):
                 f"INSERT INTO {self.connection_config.db_schema}.{self.connection_config.table} BY NAME SELECT * FROM df"  # noqa: E501
             )

+    @requires_dependencies(["pandas"], extras="duckdb")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df)

unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py
@@ -3,7 +3,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional

-import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.__version__ import __version__ as unstructured_io_ingest_version
@@ -24,6 +23,7 @@ from unstructured_ingest.v2.processes.connectors.duckdb.base import BaseDuckDBUp

 if TYPE_CHECKING:
     from duckdb import DuckDBPyConnection as MotherDuckConnection
+    from pandas import DataFrame

 CONNECTOR_TYPE = "motherduck"

@@ -100,7 +100,7 @@ class MotherDuckUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    def upload_dataframe(self, df:
+    def upload_dataframe(self, df: "DataFrame") -> None:
         logger.debug(f"uploading {len(df)} entries to {self.connection_config.database} ")
         database = self.connection_config.database
         db_schema = self.connection_config.db_schema
@@ -109,7 +109,10 @@ class MotherDuckUploader(Uploader):
         with self.connection_config.get_client() as conn:
             conn.query(f'INSERT INTO "{database}"."{db_schema}"."{table}" BY NAME SELECT * FROM df')

+    @requires_dependencies(["pandas"], extras="duckdb")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data=data)
         self.upload_dataframe(df=df)

unstructured_ingest/v2/processes/connectors/kdbai.py
@@ -3,7 +3,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional

-import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.error import DestinationConnectionError
@@ -26,6 +25,7 @@ from unstructured_ingest.v2.utils import get_enhanced_element_id

 if TYPE_CHECKING:
     from kdbai_client import Database, Session, Table
+    from pandas import DataFrame

 CONNECTOR_TYPE = "kdbai"

@@ -118,11 +118,11 @@ class KdbaiUploader(Uploader):
             table = db.table(self.upload_config.table_name)
             yield table

-    def upsert_batch(self, batch:
+    def upsert_batch(self, batch: "DataFrame"):
         with self.get_table() as table:
             table.insert(batch)

-    def process_dataframe(self, df:
+    def process_dataframe(self, df: "DataFrame"):
         logger.debug(
             f"uploading {len(df)} entries to {self.connection_config.endpoint} "
             f"db {self.upload_config.database_name} in table {self.upload_config.table_name}"
@@ -130,7 +130,10 @@ class KdbaiUploader(Uploader):
         for batch_df in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
             self.upsert_batch(batch=batch_df)

+    @requires_dependencies(["pandas"], extras="kdbai")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data=data)
         self.process_dataframe(df=df)

unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py
@@ -8,7 +8,6 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, AsyncGenerator, Optional

-import pandas as pd
 from pydantic import Field

 from unstructured_ingest.error import DestinationConnectionError
@@ -26,6 +25,7 @@ CONNECTOR_TYPE = "lancedb"
 if TYPE_CHECKING:
     from lancedb import AsyncConnection
     from lancedb.table import AsyncTable
+    from pandas import DataFrame


 class LanceDBConnectionConfig(ConnectionConfig, ABC):
@@ -69,6 +69,7 @@ class LanceDBUploadStager(UploadStager):
         default_factory=LanceDBUploadStagerConfig
     )

+    @requires_dependencies(["pandas"], extras="lancedb")
     def run(
         self,
         elements_filepath: Path,
@@ -77,6 +78,8 @@ class LanceDBUploadStager(UploadStager):
         output_filename: str,
         **kwargs: Any,
     ) -> Path:
+        import pandas as pd
+
         with open(elements_filepath) as elements_file:
             elements_contents: list[dict] = json.load(elements_file)

@@ -129,7 +132,10 @@ class LanceDBUploader(Uploader):
         finally:
             table.close()

+    @requires_dependencies(["pandas"], extras="lancedb")
     async def run_async(self, path, file_data, **kwargs):
+        import pandas as pd
+
         df = pd.read_feather(path)
         async with self.get_table() as table:
             schema = await table.schema()
@@ -144,7 +150,9 @@ class LanceDBUploader(Uploader):
             await table.delete(f'{RECORD_ID_LABEL} = "{file_data.identifier}"')
             await table.add(data=df)

-    def _fit_to_schema(self, df:
+    def _fit_to_schema(self, df: "DataFrame", schema) -> "DataFrame":
+        import pandas as pd
+
         columns = set(df.columns)
         schema_fields = set(schema.names)
         columns_to_drop = columns - schema_fields
unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py
@@ -3,8 +3,6 @@ from contextlib import contextmanager
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Generator, Optional

-import numpy as np
-import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.utils.data_prep import split_dataframe
@@ -27,6 +25,7 @@ if TYPE_CHECKING:
     from databricks.sdk.core import oauth_service_principal
     from databricks.sql.client import Connection as DeltaTableConnection
     from databricks.sql.client import Cursor as DeltaTableCursor
+    from pandas import DataFrame

 CONNECTOR_TYPE = "databricks_delta_tables"

@@ -180,7 +179,10 @@ class DatabricksDeltaTablesUploader(SQLUploader):
         )
         return statement

-
+    @requires_dependencies(["pandas"], extras="databricks-delta-tables")
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
+        import numpy as np
+
         if self.can_delete():
             self.delete_by_record_id(file_data=file_data)
         else:
unstructured_ingest/v2/processes/connectors/sql/singlestore.py
@@ -3,9 +3,9 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Generator, Optional

-import pandas as pd
 from pydantic import Field, Secret

+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -46,6 +46,7 @@ class SingleStoreConnectionConfig(SQLConnectionConfig):
     database: Optional[str] = Field(default=None, description="SingleStore database")

     @contextmanager
+    @requires_dependencies(["singlestoredb"], extras="singlestore")
     def get_connection(self) -> Generator["SingleStoreConnection", None, None]:
         import singlestoredb as s2

@@ -130,9 +131,12 @@ class SingleStoreUploader(SQLUploader):
     values_delimiter: str = "%s"
     connector_type: str = CONNECTOR_TYPE

+    @requires_dependencies(["pandas"], extras="singlestore")
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
     ) -> list[tuple[Any, ...]]:
+        import pandas as pd
+
         output = []
         for row in data:
             parsed = []
unstructured_ingest/v2/processes/connectors/sql/snowflake.py
@@ -3,8 +3,6 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Generator, Optional

-import numpy as np
-import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.utils.data_prep import split_dataframe
@@ -32,6 +30,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
 )

 if TYPE_CHECKING:
+    from pandas import DataFrame
     from snowflake.connector import SnowflakeConnection
     from snowflake.connector.cursor import SnowflakeCursor

@@ -174,9 +173,12 @@ class SnowflakeUploader(SQLUploader):
     connector_type: str = CONNECTOR_TYPE
     values_delimiter: str = "?"

+    @requires_dependencies(["pandas"], extras="snowflake")
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
     ) -> list[tuple[Any, ...]]:
+        import pandas as pd
+
         output = []
         for row in data:
             parsed = []
@@ -210,7 +212,9 @@ class SnowflakeUploader(SQLUploader):
             ]
         )

-    def upload_dataframe(self, df:
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
+        import numpy as np
+
         if self.can_delete():
             self.delete_by_record_id(file_data=file_data)
         else:
unstructured_ingest/v2/processes/connectors/sql/sql.py
@@ -6,10 +6,8 @@ from dataclasses import dataclass, field
 from datetime import datetime
 from pathlib import Path
 from time import time
-from typing import Any, Generator, Union
+from typing import TYPE_CHECKING, Any, Generator, Union

-import numpy as np
-import pandas as pd
 from dateutil import parser
 from pydantic import BaseModel, Field, Secret

@@ -38,6 +36,9 @@ from unstructured_ingest.v2.interfaces import (
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.utils import get_enhanced_element_id

+if TYPE_CHECKING:
+    from pandas import DataFrame
+
 _DATE_COLUMNS = ("date_created", "date_modified", "date_processed", "last_modified")


@@ -154,13 +155,15 @@ class SQLDownloader(Downloader, ABC):
     def query_db(self, file_data: SqlBatchFileData) -> tuple[list[tuple], list[str]]:
         pass

-    def sql_to_df(self, rows: list[tuple], columns: list[str]) -> list[
+    def sql_to_df(self, rows: list[tuple], columns: list[str]) -> list["DataFrame"]:
+        import pandas as pd
+
         data = [dict(zip(columns, row)) for row in rows]
         df = pd.DataFrame(data)
         dfs = [pd.DataFrame([row.values], columns=df.columns) for index, row in df.iterrows()]
         return dfs

-    def get_data(self, file_data: SqlBatchFileData) -> list[
+    def get_data(self, file_data: SqlBatchFileData) -> list["DataFrame"]:
         rows, columns = self.query_db(file_data=file_data)
         return self.sql_to_df(rows=rows, columns=columns)

@@ -174,7 +177,7 @@ class SQLDownloader(Downloader, ABC):
         return f

     def generate_download_response(
-        self, result:
+        self, result: "DataFrame", file_data: SqlBatchFileData
     ) -> DownloadResponse:
         id_column = file_data.additional_metadata.id_column
         table_name = file_data.additional_metadata.table_name
@@ -231,7 +234,7 @@ class SQLUploadStager(UploadStager):
         data[RECORD_ID_LABEL] = file_data.identifier
         return data

-    def conform_dataframe(self, df:
+    def conform_dataframe(self, df: "DataFrame") -> "DataFrame":
         for column in filter(lambda x: x in df.columns, _DATE_COLUMNS):
             df[column] = df[column].apply(parse_date_string).apply(lambda date: date.timestamp())
         for column in filter(
@@ -259,6 +262,8 @@ class SQLUploadStager(UploadStager):
         output_filename: str,
         **kwargs: Any,
     ) -> Path:
+        import pandas as pd
+
         elements_contents = get_data(path=elements_filepath)

         df = pd.DataFrame(
@@ -309,6 +314,8 @@ class SQLUploader(Uploader):
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
     ) -> list[tuple[Any, ...]]:
+        import pandas as pd
+
         output = []
         for row in data:
             parsed = []
@@ -323,7 +330,9 @@ class SQLUploader(Uploader):
             output.append(tuple(parsed))
         return output

-    def _fit_to_schema(self, df:
+    def _fit_to_schema(self, df: "DataFrame", add_missing_columns: bool = True) -> "DataFrame":
+        import pandas as pd
+
         table_columns = self.get_table_columns()
         columns = set(df.columns)
         schema_fields = set(table_columns)
@@ -348,7 +357,9 @@ class SQLUploader(Uploader):
             df[column] = pd.Series()
         return df

-    def upload_dataframe(self, df:
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
+        import numpy as np
+
         if self.can_delete():
             self.delete_by_record_id(file_data=file_data)
         else:
@@ -409,6 +420,8 @@ class SQLUploader(Uploader):
         logger.info(f"deleted {rowcount} rows from table {self.upload_config.table_name}")

     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
+        import pandas as pd
+
         df = pd.DataFrame(data)
         self.upload_dataframe(df=df, file_data=file_data)

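_fit_to_schema and upload_dataframe in sql.py now pull pandas and numpy in only when called. Judging from the visible hunk lines (`columns - schema_fields`, `df[column] = pd.Series()`), the schema-fitting step drops columns the destination table lacks and adds empty ones it expects; the sketch below is a hypothetical paraphrase of that behavior, not the library's exact code.

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from pandas import DataFrame


def fit_to_schema(df: "DataFrame", table_columns: list[str]) -> "DataFrame":
    import pandas as pd

    columns = set(df.columns)
    schema_fields = set(table_columns)
    # Drop fields the destination table does not define...
    df = df.drop(columns=list(columns - schema_fields))
    # ...and add (empty) columns the table expects but the data lacks.
    for missing in schema_fields - columns:
        df[missing] = pd.Series()
    return df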
unstructured_ingest/v2/processes/connectors/sql/sqlite.py
@@ -4,9 +4,9 @@ from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator

-import pandas as pd
 from pydantic import Field, Secret, model_validator

+from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -32,6 +32,7 @@ if TYPE_CHECKING:
     from sqlite3 import Connection as SqliteConnection
     from sqlite3 import Cursor as SqliteCursor

+
 CONNECTOR_TYPE = "sqlite"


@@ -132,9 +133,12 @@ class SQLiteUploader(SQLUploader):
     connection_config: SQLiteConnectionConfig
     connector_type: str = CONNECTOR_TYPE

+    @requires_dependencies(["pandas"])
     def prepare_data(
         self, columns: list[str], data: tuple[tuple[Any, ...], ...]
     ) -> list[tuple[Any, ...]]:
+        import pandas as pd
+
         output = []
         for row in data:
             parsed = []
unstructured_ingest/v2/processes/connectors/sql/vastdb.py
@@ -2,8 +2,6 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Optional

-import numpy as np
-import pandas as pd
 from pydantic import Field, Secret

 from unstructured_ingest.error import DestinationConnectionError
@@ -34,6 +32,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
 from unstructured_ingest.v2.utils import get_enhanced_element_id

 if TYPE_CHECKING:
+    from pandas import DataFrame
     from vastdb import connect as VastdbConnect
     from vastdb import transaction as VastdbTransaction
     from vastdb.table import Table as VastdbTable
@@ -128,7 +127,6 @@ class VastdbDownloader(SQLDownloader):
         ids = tuple([item.identifier for item in file_data.batch_items])

         with self.connection_config.get_table(table_name) as table:
-
             predicate = _[id_column].isin(ids)

             if self.download_config.fields:
@@ -168,7 +166,7 @@ class VastdbUploadStager(SQLUploadStager):
         data[RECORD_ID_LABEL] = file_data.identifier
         return data

-    def conform_dataframe(self, df:
+    def conform_dataframe(self, df: "DataFrame") -> "DataFrame":
         df = super().conform_dataframe(df=df)
         if self.upload_stager_config.rename_columns_map:
             df.rename(columns=self.upload_stager_config.rename_columns_map, inplace=True)
@@ -193,8 +191,9 @@ class VastdbUploader(SQLUploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

-    @requires_dependencies(["pyarrow"], extras="vastdb")
-    def upload_dataframe(self, df:
+    @requires_dependencies(["pyarrow", "pandas"], extras="vastdb")
+    def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
+        import numpy as np
         import pyarrow as pa

         if self.can_delete():
@@ -216,7 +215,6 @@ class VastdbUploader(SQLUploader):
         )

         for rows in split_dataframe(df=df, chunk_size=self.upload_config.batch_size):
-
             with self.connection_config.get_table(self.upload_config.table_name) as table:
                 pa_table = pa.Table.from_pandas(rows)
                 table.insert(pa_table)
{unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.21.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: unstructured-ingest
-Version: 0.5.20
+Version: 0.5.21
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,197 +22,348 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist: python-dateutil
 Requires-Dist: click
-Requires-Dist:
+Requires-Dist: dataclasses_json
 Requires-Dist: pydantic>=2.7
-Requires-Dist:
+Requires-Dist: python-dateutil
+Requires-Dist: opentelemetry-sdk
 Requires-Dist: tqdm
-Requires-Dist:
+Requires-Dist: numpy
+Requires-Dist: pandas
 Provides-Extra: remote
 Requires-Dist: unstructured-client>=0.30.0; extra == "remote"
+Requires-Dist: numpy; extra == "remote"
+Requires-Dist: pandas; extra == "remote"
 Provides-Extra: csv
 Requires-Dist: unstructured[tsv]; extra == "csv"
+Requires-Dist: numpy; extra == "csv"
+Requires-Dist: pandas; extra == "csv"
 Provides-Extra: doc
 Requires-Dist: unstructured[docx]; extra == "doc"
+Requires-Dist: numpy; extra == "doc"
+Requires-Dist: pandas; extra == "doc"
 Provides-Extra: docx
 Requires-Dist: unstructured[docx]; extra == "docx"
+Requires-Dist: numpy; extra == "docx"
+Requires-Dist: pandas; extra == "docx"
 Provides-Extra: epub
 Requires-Dist: unstructured[epub]; extra == "epub"
+Requires-Dist: numpy; extra == "epub"
+Requires-Dist: pandas; extra == "epub"
 Provides-Extra: md
 Requires-Dist: unstructured[md]; extra == "md"
+Requires-Dist: numpy; extra == "md"
+Requires-Dist: pandas; extra == "md"
 Provides-Extra: msg
 Requires-Dist: unstructured[msg]; extra == "msg"
+Requires-Dist: numpy; extra == "msg"
+Requires-Dist: pandas; extra == "msg"
 Provides-Extra: odt
 Requires-Dist: unstructured[odt]; extra == "odt"
+Requires-Dist: numpy; extra == "odt"
+Requires-Dist: pandas; extra == "odt"
 Provides-Extra: org
 Requires-Dist: unstructured[org]; extra == "org"
+Requires-Dist: numpy; extra == "org"
+Requires-Dist: pandas; extra == "org"
 Provides-Extra: pdf
 Requires-Dist: unstructured[pdf]; extra == "pdf"
+Requires-Dist: numpy; extra == "pdf"
+Requires-Dist: pandas; extra == "pdf"
 Provides-Extra: ppt
 Requires-Dist: unstructured[pptx]; extra == "ppt"
+Requires-Dist: numpy; extra == "ppt"
+Requires-Dist: pandas; extra == "ppt"
 Provides-Extra: pptx
 Requires-Dist: unstructured[pptx]; extra == "pptx"
+Requires-Dist: numpy; extra == "pptx"
+Requires-Dist: pandas; extra == "pptx"
 Provides-Extra: rtf
 Requires-Dist: unstructured[rtf]; extra == "rtf"
+Requires-Dist: numpy; extra == "rtf"
+Requires-Dist: pandas; extra == "rtf"
 Provides-Extra: rst
 Requires-Dist: unstructured[rst]; extra == "rst"
+Requires-Dist: numpy; extra == "rst"
+Requires-Dist: pandas; extra == "rst"
 Provides-Extra: tsv
 Requires-Dist: unstructured[tsv]; extra == "tsv"
+Requires-Dist: numpy; extra == "tsv"
+Requires-Dist: pandas; extra == "tsv"
 Provides-Extra: xlsx
 Requires-Dist: unstructured[xlsx]; extra == "xlsx"
+Requires-Dist: numpy; extra == "xlsx"
+Requires-Dist: pandas; extra == "xlsx"
 Provides-Extra: airtable
 Requires-Dist: pyairtable; extra == "airtable"
+Requires-Dist: numpy; extra == "airtable"
+Requires-Dist: pandas; extra == "airtable"
 Provides-Extra: astradb
 Requires-Dist: astrapy; extra == "astradb"
+Requires-Dist: numpy; extra == "astradb"
+Requires-Dist: pandas; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: adlfs; extra == "azure"
 Requires-Dist: fsspec; extra == "azure"
+Requires-Dist: adlfs; extra == "azure"
+Requires-Dist: numpy; extra == "azure"
+Requires-Dist: pandas; extra == "azure"
 Provides-Extra: azure-ai-search
 Requires-Dist: azure-search-documents; extra == "azure-ai-search"
+Requires-Dist: numpy; extra == "azure-ai-search"
+Requires-Dist: pandas; extra == "azure-ai-search"
 Provides-Extra: biomed
 Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: requests; extra == "biomed"
+Requires-Dist: numpy; extra == "biomed"
+Requires-Dist: pandas; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: boxfs; extra == "box"
 Requires-Dist: fsspec; extra == "box"
+Requires-Dist: boxfs; extra == "box"
+Requires-Dist: numpy; extra == "box"
+Requires-Dist: pandas; extra == "box"
 Provides-Extra: chroma
 Requires-Dist: chromadb; extra == "chroma"
+Requires-Dist: numpy; extra == "chroma"
+Requires-Dist: pandas; extra == "chroma"
 Provides-Extra: clarifai
 Requires-Dist: clarifai; extra == "clarifai"
+Requires-Dist: numpy; extra == "clarifai"
+Requires-Dist: pandas; extra == "clarifai"
 Provides-Extra: confluence
-Requires-Dist: requests; extra == "confluence"
 Requires-Dist: atlassian-python-api; extra == "confluence"
+Requires-Dist: requests; extra == "confluence"
+Requires-Dist: numpy; extra == "confluence"
+Requires-Dist: pandas; extra == "confluence"
 Provides-Extra: couchbase
 Requires-Dist: couchbase; extra == "couchbase"
+Requires-Dist: numpy; extra == "couchbase"
+Requires-Dist: pandas; extra == "couchbase"
 Provides-Extra: delta-table
 Requires-Dist: boto3; extra == "delta-table"
 Requires-Dist: deltalake; extra == "delta-table"
+Requires-Dist: numpy; extra == "delta-table"
+Requires-Dist: pandas; extra == "delta-table"
 Provides-Extra: discord
 Requires-Dist: discord.py; extra == "discord"
+Requires-Dist: numpy; extra == "discord"
+Requires-Dist: pandas; extra == "discord"
 Provides-Extra: dropbox
 Requires-Dist: fsspec; extra == "dropbox"
 Requires-Dist: dropboxdrivefs; extra == "dropbox"
+Requires-Dist: numpy; extra == "dropbox"
+Requires-Dist: pandas; extra == "dropbox"
 Provides-Extra: duckdb
 Requires-Dist: duckdb; extra == "duckdb"
+Requires-Dist: numpy; extra == "duckdb"
+Requires-Dist: pandas; extra == "duckdb"
 Provides-Extra: elasticsearch
 Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
+Requires-Dist: numpy; extra == "elasticsearch"
+Requires-Dist: pandas; extra == "elasticsearch"
 Provides-Extra: gcs
 Requires-Dist: bs4; extra == "gcs"
-Requires-Dist: gcsfs; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
+Requires-Dist: gcsfs; extra == "gcs"
+Requires-Dist: numpy; extra == "gcs"
+Requires-Dist: pandas; extra == "gcs"
 Provides-Extra: github
 Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
+Requires-Dist: numpy; extra == "github"
+Requires-Dist: pandas; extra == "github"
 Provides-Extra: gitlab
 Requires-Dist: python-gitlab; extra == "gitlab"
+Requires-Dist: numpy; extra == "gitlab"
+Requires-Dist: pandas; extra == "gitlab"
 Provides-Extra: google-drive
 Requires-Dist: google-api-python-client; extra == "google-drive"
+Requires-Dist: numpy; extra == "google-drive"
+Requires-Dist: pandas; extra == "google-drive"
 Provides-Extra: hubspot
-Requires-Dist: hubspot-api-client; extra == "hubspot"
 Requires-Dist: urllib3; extra == "hubspot"
+Requires-Dist: hubspot-api-client; extra == "hubspot"
+Requires-Dist: numpy; extra == "hubspot"
+Requires-Dist: pandas; extra == "hubspot"
 Provides-Extra: ibm-watsonx-s3
-Requires-Dist:
+Requires-Dist: pyiceberg; extra == "ibm-watsonx-s3"
 Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
+Requires-Dist: httpx; extra == "ibm-watsonx-s3"
 Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
-Requires-Dist:
+Requires-Dist: numpy; extra == "ibm-watsonx-s3"
+Requires-Dist: pandas; extra == "ibm-watsonx-s3"
 Provides-Extra: jira
 Requires-Dist: atlassian-python-api; extra == "jira"
+Requires-Dist: numpy; extra == "jira"
+Requires-Dist: pandas; extra == "jira"
 Provides-Extra: kafka
 Requires-Dist: confluent-kafka; extra == "kafka"
+Requires-Dist: numpy; extra == "kafka"
+Requires-Dist: pandas; extra == "kafka"
 Provides-Extra: kdbai
 Requires-Dist: kdbai-client>=1.4.0; extra == "kdbai"
+Requires-Dist: numpy; extra == "kdbai"
+Requires-Dist: pandas; extra == "kdbai"
 Provides-Extra: lancedb
 Requires-Dist: lancedb; extra == "lancedb"
+Requires-Dist: numpy; extra == "lancedb"
+Requires-Dist: pandas; extra == "lancedb"
 Provides-Extra: milvus
 Requires-Dist: pymilvus; extra == "milvus"
+Requires-Dist: numpy; extra == "milvus"
+Requires-Dist: pandas; extra == "milvus"
 Provides-Extra: mongodb
 Requires-Dist: pymongo; extra == "mongodb"
+Requires-Dist: numpy; extra == "mongodb"
+Requires-Dist: pandas; extra == "mongodb"
 Provides-Extra: neo4j
 Requires-Dist: networkx; extra == "neo4j"
-Requires-Dist: cymple; extra == "neo4j"
 Requires-Dist: neo4j-rust-ext; extra == "neo4j"
+Requires-Dist: cymple; extra == "neo4j"
+Requires-Dist: numpy; extra == "neo4j"
+Requires-Dist: pandas; extra == "neo4j"
 Provides-Extra: notion
+Requires-Dist: httpx; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
 Requires-Dist: notion-client; extra == "notion"
-Requires-Dist: httpx; extra == "notion"
 Requires-Dist: backoff; extra == "notion"
+Requires-Dist: numpy; extra == "notion"
+Requires-Dist: pandas; extra == "notion"
 Provides-Extra: onedrive
 Requires-Dist: bs4; extra == "onedrive"
-Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
 Requires-Dist: msal; extra == "onedrive"
+Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+Requires-Dist: numpy; extra == "onedrive"
+Requires-Dist: pandas; extra == "onedrive"
 Provides-Extra: opensearch
 Requires-Dist: opensearch-py; extra == "opensearch"
+Requires-Dist: numpy; extra == "opensearch"
+Requires-Dist: pandas; extra == "opensearch"
 Provides-Extra: outlook
-Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
 Requires-Dist: msal; extra == "outlook"
+Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
+Requires-Dist: numpy; extra == "outlook"
+Requires-Dist: pandas; extra == "outlook"
 Provides-Extra: pinecone
 Requires-Dist: pinecone-client>=3.7.1; extra == "pinecone"
+Requires-Dist: numpy; extra == "pinecone"
+Requires-Dist: pandas; extra == "pinecone"
 Provides-Extra: postgres
 Requires-Dist: psycopg2-binary; extra == "postgres"
+Requires-Dist: numpy; extra == "postgres"
+Requires-Dist: pandas; extra == "postgres"
 Provides-Extra: qdrant
 Requires-Dist: qdrant-client; extra == "qdrant"
+Requires-Dist: numpy; extra == "qdrant"
+Requires-Dist: pandas; extra == "qdrant"
 Provides-Extra: reddit
 Requires-Dist: praw; extra == "reddit"
+Requires-Dist: numpy; extra == "reddit"
+Requires-Dist: pandas; extra == "reddit"
 Provides-Extra: redis
 Requires-Dist: redis; extra == "redis"
+Requires-Dist: numpy; extra == "redis"
+Requires-Dist: pandas; extra == "redis"
 Provides-Extra: s3
-Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: fsspec; extra == "s3"
+Requires-Dist: s3fs; extra == "s3"
+Requires-Dist: numpy; extra == "s3"
+Requires-Dist: pandas; extra == "s3"
 Provides-Extra: sharepoint
-Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
 Requires-Dist: msal; extra == "sharepoint"
+Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
+Requires-Dist: numpy; extra == "sharepoint"
+Requires-Dist: pandas; extra == "sharepoint"
 Provides-Extra: salesforce
 Requires-Dist: simple-salesforce; extra == "salesforce"
+Requires-Dist: numpy; extra == "salesforce"
+Requires-Dist: pandas; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: paramiko; extra == "sftp"
 Requires-Dist: fsspec; extra == "sftp"
+Requires-Dist: paramiko; extra == "sftp"
+Requires-Dist: numpy; extra == "sftp"
+Requires-Dist: pandas; extra == "sftp"
 Provides-Extra: slack
 Requires-Dist: slack_sdk[optional]; extra == "slack"
+Requires-Dist: numpy; extra == "slack"
+Requires-Dist: pandas; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: snowflake-connector-python; extra == "snowflake"
 Requires-Dist: psycopg2-binary; extra == "snowflake"
+Requires-Dist: snowflake-connector-python; extra == "snowflake"
+Requires-Dist: numpy; extra == "snowflake"
+Requires-Dist: pandas; extra == "snowflake"
 Provides-Extra: wikipedia
 Requires-Dist: wikipedia; extra == "wikipedia"
+Requires-Dist: numpy; extra == "wikipedia"
+Requires-Dist: pandas; extra == "wikipedia"
 Provides-Extra: weaviate
 Requires-Dist: weaviate-client; extra == "weaviate"
+Requires-Dist: numpy; extra == "weaviate"
+Requires-Dist: pandas; extra == "weaviate"
 Provides-Extra: databricks-volumes
 Requires-Dist: databricks-sdk; extra == "databricks-volumes"
+Requires-Dist: numpy; extra == "databricks-volumes"
+Requires-Dist: pandas; extra == "databricks-volumes"
 Provides-Extra: databricks-delta-tables
 Requires-Dist: databricks-sql-connector; extra == "databricks-delta-tables"
+Requires-Dist: numpy; extra == "databricks-delta-tables"
+Requires-Dist: pandas; extra == "databricks-delta-tables"
 Provides-Extra: singlestore
 Requires-Dist: singlestoredb; extra == "singlestore"
+Requires-Dist: numpy; extra == "singlestore"
+Requires-Dist: pandas; extra == "singlestore"
 Provides-Extra: vectara
 Requires-Dist: httpx; extra == "vectara"
 Requires-Dist: aiofiles; extra == "vectara"
 Requires-Dist: requests; extra == "vectara"
+Requires-Dist: numpy; extra == "vectara"
+Requires-Dist: pandas; extra == "vectara"
 Provides-Extra: vastdb
+Requires-Dist: ibis; extra == "vastdb"
 Requires-Dist: pyarrow; extra == "vastdb"
 Requires-Dist: vastdb; extra == "vastdb"
-Requires-Dist:
+Requires-Dist: numpy; extra == "vastdb"
+Requires-Dist: pandas; extra == "vastdb"
 Provides-Extra: zendesk
 Requires-Dist: bs4; extra == "zendesk"
-Requires-Dist: aiofiles; extra == "zendesk"
 Requires-Dist: httpx; extra == "zendesk"
+Requires-Dist: aiofiles; extra == "zendesk"
+Requires-Dist: numpy; extra == "zendesk"
+Requires-Dist: pandas; extra == "zendesk"
 Provides-Extra: embed-huggingface
 Requires-Dist: sentence-transformers; extra == "embed-huggingface"
+Requires-Dist: numpy; extra == "embed-huggingface"
+Requires-Dist: pandas; extra == "embed-huggingface"
 Provides-Extra: embed-octoai
-Requires-Dist: openai; extra == "embed-octoai"
 Requires-Dist: tiktoken; extra == "embed-octoai"
+Requires-Dist: openai; extra == "embed-octoai"
+Requires-Dist: numpy; extra == "embed-octoai"
+Requires-Dist: pandas; extra == "embed-octoai"
 Provides-Extra: embed-vertexai
 Requires-Dist: vertexai; extra == "embed-vertexai"
+Requires-Dist: numpy; extra == "embed-vertexai"
+Requires-Dist: pandas; extra == "embed-vertexai"
 Provides-Extra: embed-voyageai
 Requires-Dist: voyageai; extra == "embed-voyageai"
+Requires-Dist: numpy; extra == "embed-voyageai"
+Requires-Dist: pandas; extra == "embed-voyageai"
 Provides-Extra: embed-mixedbreadai
 Requires-Dist: mixedbread-ai; extra == "embed-mixedbreadai"
+Requires-Dist: numpy; extra == "embed-mixedbreadai"
+Requires-Dist: pandas; extra == "embed-mixedbreadai"
 Provides-Extra: openai
-Requires-Dist: openai; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
+Requires-Dist: openai; extra == "openai"
+Requires-Dist: numpy; extra == "openai"
+Requires-Dist: pandas; extra == "openai"
 Provides-Extra: bedrock
 Requires-Dist: aioboto3; extra == "bedrock"
 Requires-Dist: boto3; extra == "bedrock"
+Requires-Dist: numpy; extra == "bedrock"
+Requires-Dist: pandas; extra == "bedrock"
 Provides-Extra: togetherai
 Requires-Dist: together; extra == "togetherai"
+Requires-Dist: numpy; extra == "togetherai"
+Requires-Dist: pandas; extra == "togetherai"
 Dynamic: author
 Dynamic: author-email
 Dynamic: classifier
@@ -5,7 +5,7 @@ test/integration/chunkers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
|
|
|
5
5
|
test/integration/chunkers/test_chunkers.py,sha256=USkltQN_mVVCxI0FkJsrS1gnLXlVr-fvsc0tPaK2sWI,1062
|
|
6
6
|
test/integration/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
7
|
test/integration/connectors/conftest.py,sha256=vYs4WDlCuieAwwErkJxCk4a1lGvr3qpeiAm-YaDznSo,1018
|
|
8
|
-
test/integration/connectors/test_astradb.py,sha256=
|
|
8
|
+
test/integration/connectors/test_astradb.py,sha256=hQyxvnbvN1UN-oDOBkXyniAs6GLb0rstQOoLT4LcBNI,9921
|
|
9
9
|
test/integration/connectors/test_azure_ai_search.py,sha256=MxFwk84vI_HT4taQTGrNpJ8ewGPqHSGrx626j8hC_Pw,9695
|
|
10
10
|
test/integration/connectors/test_chroma.py,sha256=1uGHbZXkXKGb8wl3p7c9G-L1MViUe283Hw5u3dg8OgI,4532
|
|
11
11
|
test/integration/connectors/test_confluence.py,sha256=W93znOusdvFXta8q0dqQ1rKhLafRVIqrfaFqk2FY-fo,3590
|
|
@@ -113,7 +113,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
|
|
|
113
113
|
test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
114
114
|
test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
|
|
115
115
|
unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
|
|
116
|
-
unstructured_ingest/__version__.py,sha256=
|
|
116
|
+
unstructured_ingest/__version__.py,sha256=b5BrQJjlBZoPiM_J1cJDbJABGvcwaDFb_Bvwb0AHN10,43
|
|
117
117
|
unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
|
|
118
118
|
unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
|
|
119
119
|
unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
|
|
@@ -284,7 +284,7 @@ unstructured_ingest/embed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
|
|
|
284
284
|
unstructured_ingest/embed/azure_openai.py,sha256=_-I-nwd-wdCiKkSdYBL4UKrTZ2UPWsM_0T69fcObs_I,1707
|
|
285
285
|
unstructured_ingest/embed/bedrock.py,sha256=tZumLLXafSr1zIFVjckapRoiiY-7u65GPuWmwsdhY0I,7726
|
|
286
286
|
unstructured_ingest/embed/huggingface.py,sha256=-ZD17O_H_UnK80fqig6y6wNKJckjx0HuAkY5vgPvk8M,2259
|
|
287
|
-
unstructured_ingest/embed/interfaces.py,sha256=
|
|
287
|
+
unstructured_ingest/embed/interfaces.py,sha256=SdB3t8eMPB8CbXzOYBpgwjzTvyb4T19L61Sr6Jy3_rw,5099
|
|
288
288
|
unstructured_ingest/embed/mixedbreadai.py,sha256=-Y0J27G9CL1t3ZTIeNjTjRviErSMAzJRf2zgDgMHUmg,4499
|
|
289
289
|
unstructured_ingest/embed/octoai.py,sha256=hNLEskDEP-2qWExUgVz2Eyw3KTIFwdUE9elbJ5qp4Ao,3855
|
|
290
290
|
unstructured_ingest/embed/openai.py,sha256=EindGUouvP8wolOBNbWQhAkaI6WGyPN4Hh2xyKuR6L8,3372
|
|
@@ -372,13 +372,13 @@ unstructured_ingest/runner/writers/fsspec/s3.py,sha256=kHJq2O3864QBd_tL2SKb0mdyw
 unstructured_ingest/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/utils/chunking.py,sha256=9b3sXMA6L8RW5xAkKQbwdtVudGLAcj_sgT6Grh5tyYM,1870
 unstructured_ingest/utils/compression.py,sha256=NNiY-2S2Gf3at7zC1PYxMijaEza9vVSzRn5mdFf6mHo,4434
-unstructured_ingest/utils/data_prep.py,sha256
+unstructured_ingest/utils/data_prep.py,sha256=-hhGbWm1Sev57t9z20JJLW0vS6kdhArCbb_xmIlKGaY,7826
 unstructured_ingest/utils/dep_check.py,sha256=SXXcUna2H0RtxA6j1S2NGkvQa9JP2DujWhmyBa7776Y,2400
 unstructured_ingest/utils/google_filetype.py,sha256=YVspEkiiBrRUSGVeVbsavvLvTmizdy2e6TsjigXTSRU,468
 unstructured_ingest/utils/html.py,sha256=DGRDMqGbwH8RiF94Qh6NiqVkbbjZfe1h26dIehC-X7M,6340
 unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01qAbElH0,1201
 unstructured_ingest/utils/string_and_date_utils.py,sha256=54tzuqmhPN0uWnPLrzAWAsDGU9s6mQE_KSVywMDwTBk,2522
-unstructured_ingest/utils/table.py,sha256=
+unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
 unstructured_ingest/v2/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
 unstructured_ingest/v2/constants.py,sha256=pDspTYz-nEojHBqrZNfssGEiujmVa02pIWL63PQP9sU,103
 unstructured_ingest/v2/errors.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
@@ -435,12 +435,12 @@ unstructured_ingest/v2/processes/connectors/azure_ai_search.py,sha256=ngPDpU0oZ6
 unstructured_ingest/v2/processes/connectors/chroma.py,sha256=VHCnM56qNXuHzovJihrNfJnZbWLJShOe8j12PJFrbL0,7219
 unstructured_ingest/v2/processes/connectors/confluence.py,sha256=gSs4-AxL0gfeWdJfP7JfCrQSQNLoJRkvHquKK9RJvpQ,12043
 unstructured_ingest/v2/processes/connectors/couchbase.py,sha256=i7vuNKsUkN93JRVmg4--MO0ZgbjvhIqt46oYqk9zFSQ,12250
-unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=
+unstructured_ingest/v2/processes/connectors/delta_table.py,sha256=Jx2EUqchJDqfPsyw4Ks-HaLSq2rIwXc1l1YFqjh_BbM,7240
 unstructured_ingest/v2/processes/connectors/discord.py,sha256=-e4-cBK4TnHkknK1qIb86AIVMy81lBgC288_iLpTzM8,5246
 unstructured_ingest/v2/processes/connectors/gitlab.py,sha256=ufE65Z8q_tC4oppGg5BsGXwSaL7RbEXcaagJQYsylNo,9984
 unstructured_ingest/v2/processes/connectors/google_drive.py,sha256=QzcHNelUbnubsDtanFIgDCRzmYTuP-GjJ_g9y8fButE,19623
 unstructured_ingest/v2/processes/connectors/jira.py,sha256=-f_vIWNw6Xr8rMNdAcfCC2cmhB-QndnZk5XymHo60FU,17094
-unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=
+unstructured_ingest/v2/processes/connectors/kdbai.py,sha256=1dXfNb3qaV669-_BjCQdznmfuWLPGjmdkv2ybmkAHjQ,5099
 unstructured_ingest/v2/processes/connectors/local.py,sha256=FWPRjjUsnQjyZMChuZGuMU04AB5X0sFEOcAXhx1r9sk,7381
 unstructured_ingest/v2/processes/connectors/milvus.py,sha256=wmcu9NVy3gYlQGT25inN5w_QrhFoL8-hRq0pJFSNw8g,8866
 unstructured_ingest/v2/processes/connectors/mongodb.py,sha256=cL0QUQZF_s2brh3nNNeAywXVpaIiND4b5JTAFlYjLjw,14273
@@ -464,9 +464,9 @@ unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=Uss
 unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=g1qYnIrML4TjN7rmC0MGrD5JzAprb6SymBHlEdOumz0,3113
 unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=FZhjrMYBr_je6mWYp7MUUvyKR9YwGD2HiNljeT7U5ws,5044
 unstructured_ingest/v2/processes/connectors/duckdb/__init__.py,sha256=5sVvJCWhU-YkjHIwk4W6BZCanFYK5W4xTpWtQ8xzeB4,561
-unstructured_ingest/v2/processes/connectors/duckdb/base.py,sha256=
-unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py,sha256=
-unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py,sha256=
+unstructured_ingest/v2/processes/connectors/duckdb/base.py,sha256=o3J81DnSwt3lmAh19jXVPAYRZLJ3VyGhaEVO2SIjksQ,2926
+unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py,sha256=NIo2CCiPiuTFotNC891Mbelzg01knItryYGUtOM96xg,4393
+unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py,sha256=RW-Cw94Hs3ZsN8Kb4ciSh_N-Qkp0cqkw_xkJbt8CDNU,4656
 unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py,sha256=Zzc0JNPP-eFqpwWw1Gp-XC8H-s__IgkYKzoagECycZY,829
 unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py,sha256=MEKU64OsiQmbLPb3ken-WWCIV6-pnFbs_6kjJweG-SY,18813
 unstructured_ingest/v2/processes/connectors/elasticsearch/opensearch.py,sha256=qRz8Fyr2RSZIPZGkhPeme6AZxM0aX-c_xOa1ZtSr2Kg,6781
@@ -490,7 +490,7 @@ unstructured_ingest/v2/processes/connectors/lancedb/aws.py,sha256=eeXWsh8UeVm1Ur
 unstructured_ingest/v2/processes/connectors/lancedb/azure.py,sha256=Ms5vQVRIpTF1Q2qBl_bET9wbgaf4diPaH-iR8kJlr4E,1461
 unstructured_ingest/v2/processes/connectors/lancedb/cloud.py,sha256=BFy0gW2OZ_qaZJM97m-tNsFaJPi9zOKrrd2y4thcNP0,1341
 unstructured_ingest/v2/processes/connectors/lancedb/gcp.py,sha256=p5BPaFtS3y3Yh8PIr3tUqsAXrUYu4QYYAWQNh5W2ucE,1361
-unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py,sha256=
+unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py,sha256=Y4waCOrtlz5Eyf3Me6rInzt_Ory0woseLe_hfSD1nDM,5926
 unstructured_ingest/v2/processes/connectors/lancedb/local.py,sha256=_7-6iO6B60gAWwJUUrmlsRzYMFIBeZgu_QT3mhw5L0I,1272
 unstructured_ingest/v2/processes/connectors/notion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/processes/connectors/notion/client.py,sha256=8_K6x1Z4bkvSer1NicQeqpX8Y275OUS65kfqTWRU09g,13120
@@ -564,13 +564,13 @@ unstructured_ingest/v2/processes/connectors/qdrant/local.py,sha256=cGEyv3Oy6y4BQ
 unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py,sha256=BHI7HYSdbS05j2vrjyDvLzVG1WfsM8osKeq-lttlybQ,5437
 unstructured_ingest/v2/processes/connectors/qdrant/server.py,sha256=odvCZWZp8DmRxLXMR7tHhW-c7UQbix1_zpFdfXfCvKI,1613
 unstructured_ingest/v2/processes/connectors/sql/__init__.py,sha256=NSEZwJDHh_9kFc31LnG14iRtYF3meK2UfUlQfYnwYEQ,2059
-unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py,sha256=
+unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py,sha256=Ys-pRLiYtdvNRdDnWYwhMqteLQPekRFHrqsrr9jQVpo,9049
 unstructured_ingest/v2/processes/connectors/sql/postgres.py,sha256=BATfX1PQGT2kl8jAbdNKXTojYKJxh3pJV9-h3OBnHGo,5124
-unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=
-unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=
-unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=
-unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=
-unstructured_ingest/v2/processes/connectors/sql/vastdb.py,sha256=
+unstructured_ingest/v2/processes/connectors/sql/singlestore.py,sha256=am2d87kDkpTTB0VbPSX3ce9o6oM9KUQu5y9T_p1kgJw,5711
+unstructured_ingest/v2/processes/connectors/sql/snowflake.py,sha256=r2qgoEF3bUugzgSr3hMJyIm8DKmxsO53ZHXJSNxOsvE,9379
+unstructured_ingest/v2/processes/connectors/sql/sql.py,sha256=G28VUR0zaMVmQtbdZG6TRpkWFHvXJqFrr7SBuyM-fME,15608
+unstructured_ingest/v2/processes/connectors/sql/sqlite.py,sha256=6RoBUxMbeuhduvTFlBKMgEH1NKJg7doQjXF_R5cUuX0,5319
+unstructured_ingest/v2/processes/connectors/sql/vastdb.py,sha256=wklJ8p3eMb81FTjS6ukPoILuWN0_KQBfuYGXfE0XrqY,9644
 unstructured_ingest/v2/processes/connectors/weaviate/__init__.py,sha256=NMiwnVWan69KnzVELvaqX34tMhCytIa-C8EDsXVKsEo,856
 unstructured_ingest/v2/processes/connectors/weaviate/cloud.py,sha256=bXtfEYLquR-BszZ5S_lQ4JbETNs9Vozgpfm8x9egAmE,6251
 unstructured_ingest/v2/processes/connectors/weaviate/embedded.py,sha256=S8Zg8StuZT-k7tCg1D5YShO1-vJYYk9-M1bE1fIqx64,3014
@@ -581,9 +581,9 @@ unstructured_ingest/v2/processes/connectors/zendesk/client.py,sha256=DDAYQB7catK
 unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py,sha256=R8SXYkRhVUoWEHdGCt2CzcTxxuFundw_0GlGZ34YmbM,8987
 unstructured_ingest/v2/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/processes/utils/blob_storage.py,sha256=EWvK4HRYubr9i1UyMhv5cU9u0UzVkCDC_BIm4Uxab7Y,964
-unstructured_ingest-0.5.
-unstructured_ingest-0.5.
-unstructured_ingest-0.5.
-unstructured_ingest-0.5.
-unstructured_ingest-0.5.
-unstructured_ingest-0.5.
+unstructured_ingest-0.5.21.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.5.21.dist-info/METADATA,sha256=c1bUHvgG6X9QOiAD669sVHAFkGfI2tBTRBM-eRJBLiU,14999
+unstructured_ingest-0.5.21.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+unstructured_ingest-0.5.21.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.5.21.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.5.21.dist-info/RECORD,,
{unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.21.dist-info}/LICENSE.md
RENAMED
File without changes
{unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.21.dist-info}/WHEEL
RENAMED
File without changes
{unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.21.dist-info}/entry_points.txt
RENAMED
File without changes
{unstructured_ingest-0.5.20.dist-info → unstructured_ingest-0.5.21.dist-info}/top_level.txt
RENAMED
File without changes
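
Each changed line in the RECORD diff above is a manifest entry of the form path,sha256=<digest>,<size>, where the digest is the urlsafe-base64-encoded (unpadded) SHA-256 of the file and the size is in bytes; the RECORD file itself is listed with both fields empty. As a minimal sketch of how such an entry can be recomputed for verification, assuming an unpacked copy of the wheel in the working directory and using a hypothetical helper name `record_entry` (not part of unstructured-ingest):

```python
import base64
import hashlib
from pathlib import Path


def record_entry(path: str) -> str:
    """Build a RECORD-style line: <path>,sha256=<urlsafe-b64 digest without padding>,<size in bytes>."""
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode("ascii")
    return f"{path},sha256={digest},{len(data)}"


# Example: recompute the entry for the bumped version module and compare it
# against the corresponding line in the wheel's RECORD manifest.
print(record_entry("unstructured_ingest/__version__.py"))
```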