unstructured-ingest 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/processes/chunker.py +5 -2
- unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +8 -3
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +93 -59
- {unstructured_ingest-0.6.0.dist-info → unstructured_ingest-0.6.2.dist-info}/METADATA +23 -23
- {unstructured_ingest-0.6.0.dist-info → unstructured_ingest-0.6.2.dist-info}/RECORD +11 -12
- test/unit/v2/connectors/databricks/__init__.py +0 -0
- test/unit/v2/connectors/databricks/test_volumes_table.py +0 -44
- {unstructured_ingest-0.6.0.dist-info → unstructured_ingest-0.6.2.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.6.0.dist-info → unstructured_ingest-0.6.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.6.0.dist-info → unstructured_ingest-0.6.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.6.0.dist-info → unstructured_ingest-0.6.2.dist-info}/top_level.txt +0 -0
unstructured_ingest/__version__.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.6.0"  # pragma: no cover
+__version__ = "0.6.2"  # pragma: no cover
unstructured_ingest/v2/processes/chunker.py
CHANGED
@@ -6,6 +6,7 @@ from typing import Any, Optional
 from pydantic import BaseModel, Field, SecretStr
 
 from unstructured_ingest.utils.chunking import assign_and_map_hash_ids
+from unstructured_ingest.utils.data_prep import get_json_data
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 from unstructured_ingest.v2.logger import logger
@@ -92,9 +93,11 @@ class Chunker(BaseProcess, ABC):
     @requires_dependencies(dependencies=["unstructured"])
     def run(self, elements_filepath: Path, **kwargs: Any) -> list[dict]:
         from unstructured.chunking import dispatch
-        from unstructured.staging.base import elements_from_json
+        from unstructured.staging.base import elements_from_dicts
 
-        elements = elements_from_json(filename=str(elements_filepath))
+        element_dicts = get_json_data(elements_filepath)
+
+        elements = elements_from_dicts(element_dicts=element_dicts)
         if not elements:
             return [e.to_dict() for e in elements]
         local_chunking_strategies = ("basic", "by_title")
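The chunker change above swaps elements_from_json, which expected a single JSON array file, for get_json_data plus elements_from_dicts, so chunking also works when the pipeline writes element files as NDJSON. A minimal sketch of the kind of loader this implies; the actual behavior of get_json_data is an assumption here:

import json
from pathlib import Path


def load_element_dicts(path: Path) -> list[dict]:
    """Read element dicts from a .json array file or an .ndjson file."""
    if path.suffix == ".ndjson":
        # NDJSON: one JSON object per line
        return [json.loads(line) for line in path.read_text().splitlines() if line.strip()]
    return json.loads(path.read_text())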
unstructured_ingest/v2/processes/connectors/databricks/volumes.py
CHANGED
@@ -5,7 +5,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 from uuid import NAMESPACE_DNS, uuid5
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, Secret
 
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.errors import (
@@ -61,12 +61,14 @@ class DatabricksVolumesAccessConfig(AccessConfig):
 
 
 class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
+    access_config: Secret[DatabricksVolumesAccessConfig]
     host: Optional[str] = Field(
         default=None,
         description="The Databricks host URL for either the "
        "Databricks workspace endpoint or the "
        "Databricks accounts endpoint.",
     )
+    user_agent: str = "unstructuredio_oss"
 
     def wrap_error(self, e: Exception) -> Exception:
         from databricks.sdk.errors.base import DatabricksError
@@ -94,11 +96,14 @@ class DatabricksVolumesConnectionConfig(ConnectionConfig, ABC):
     @requires_dependencies(dependencies=["databricks.sdk"], extras="databricks-volumes")
     def get_client(self) -> "WorkspaceClient":
         from databricks.sdk import WorkspaceClient
+        from databricks.sdk.core import Config
 
-        return WorkspaceClient(
+        config = Config(
             host=self.host,
             **self.access_config.get_secret_value().model_dump(),
-        )
+        ).with_user_agent_extra("PyDatabricksSdk", self.user_agent)
+
+        return WorkspaceClient(config=config)
 
 
 class DatabricksVolumesIndexerConfig(IndexerConfig, DatabricksPathMixin):
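Two things change in the connection config: credentials are now wrapped in pydantic's Secret type, and every client carries an extra user-agent product tag. A minimal standalone sketch of the Secret mechanics, with simplified models and hypothetical values; the real get_client then feeds get_secret_value().model_dump() into databricks.sdk.core.Config and calls with_user_agent_extra as shown in the diff:

from pydantic import BaseModel, Secret


class AccessConfig(BaseModel):
    token: str


class ConnectionConfig(BaseModel):
    access_config: Secret[AccessConfig]
    host: str
    user_agent: str = "unstructuredio_oss"


conn = ConnectionConfig(
    access_config=AccessConfig(token="dapi-example"),  # hypothetical credential
    host="https://example-workspace.cloud.databricks.com",  # hypothetical host
)
print(conn)  # access_config renders as Secret('**********'), not the raw token
creds = conn.access_config.get_secret_value().model_dump()  # {'token': 'dapi-example'}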
unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py
CHANGED
@@ -1,14 +1,20 @@
+import json
 import os
-import tempfile
 from contextlib import contextmanager
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Generator, Optional
 
 from pydantic import Field
 
-from unstructured_ingest.utils.data_prep import write_data
-from unstructured_ingest.v2.interfaces import Uploader, UploaderConfig
+from unstructured_ingest.utils.data_prep import get_json_data, write_data
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
+from unstructured_ingest.v2.interfaces import (
+    Uploader,
+    UploaderConfig,
+    UploadStager,
+    UploadStagerConfig,
+)
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
     DestinationRegistryEntry,
@@ -16,28 +22,50 @@ from unstructured_ingest.v2.processes.connector_registry import (
 from unstructured_ingest.v2.processes.connectors.databricks.volumes import DatabricksPathMixin
 from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
     DatabricksDeltaTablesConnectionConfig,
-    DatabricksDeltaTablesUploadStager,
     DatabricksDeltaTablesUploadStagerConfig,
 )
 from unstructured_ingest.v2.types.file_data import FileData
+from unstructured_ingest.v2.utils import get_enhanced_element_id
 
 CONNECTOR_TYPE = "databricks_volume_delta_tables"
 
 if TYPE_CHECKING:
-    from pandas import DataFrame
+    pass
 
 
 class DatabricksVolumeDeltaTableUploaderConfig(UploaderConfig, DatabricksPathMixin):
     database: str = Field(description="Database name", default="default")
-    table_name: str = Field(description="Table name")
+    table_name: Optional[str] = Field(description="Table name", default=None)
+
+
+class DatabricksVolumeDeltaTableStagerConfig(UploadStagerConfig):
+    pass
 
 
 @dataclass
-class DatabricksVolumeDeltaTableStager(DatabricksDeltaTablesUploadStager):
-    def write_output(self, output_path: Path, data: list[dict]) -> Path:
+class DatabricksVolumeDeltaTableStager(UploadStager):
+    upload_stager_config: DatabricksVolumeDeltaTableStagerConfig = field(
+        default_factory=DatabricksVolumeDeltaTableStagerConfig
+    )
+
+    def run(
+        self,
+        elements_filepath: Path,
+        output_dir: Path,
+        output_filename: str,
+        file_data: FileData,
+        **kwargs: Any,
+    ) -> Path:
         # To avoid new line issues when migrating from volumes into delta tables, omit indenting
         # and always write it as a json file
+        output_dir.mkdir(exist_ok=True, parents=True)
+        output_path = output_dir / output_filename
         final_output_path = output_path.with_suffix(".json")
+        data = get_json_data(path=elements_filepath)
+        for element in data:
+            element["id"] = get_enhanced_element_id(element_dict=element, file_data=file_data)
+            element[RECORD_ID_LABEL] = file_data.identifier
+            element["metadata"] = json.dumps(element.get("metadata", {}))
         write_data(path=final_output_path, data=data, indent=None)
         return final_output_path
 
@@ -49,6 +77,29 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
     _columns: Optional[dict[str, str]] = None
 
+    def init(self, **kwargs: Any) -> None:
+        self.create_destination(**kwargs)
+
+    def create_destination(
+        self, destination_name: str = "unstructuredautocreated", **kwargs: Any
+    ) -> bool:
+        table_name = self.upload_config.table_name or destination_name
+        self.upload_config.table_name = table_name
+        connectors_dir = Path(__file__).parents[1]
+        collection_config_file = connectors_dir / "assets" / "databricks_delta_table_schema.sql"
+        with self.get_cursor() as cursor:
+            cursor.execute("SHOW TABLES")
+            table_names = [r[1] for r in cursor.fetchall()]
+            if table_name in table_names:
+                return False
+            with collection_config_file.open() as schema_file:
+                data_lines = schema_file.readlines()
+            data_lines[0] = data_lines[0].replace("elements", table_name)
+            destination_schema = "".join([line.strip() for line in data_lines])
+            logger.info(f"creating table {table_name} for user")
+            cursor.execute(destination_schema)
+        return True
+
     def precheck(self) -> None:
         with self.connection_config.get_cursor() as cursor:
             cursor.execute("SHOW CATALOGS")
@@ -68,14 +119,6 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
                         self.upload_config.database, ", ".join(databases)
                     )
                 )
-            cursor.execute(f"SHOW TABLES IN {self.upload_config.database}")
-            table_names = [r[1] for r in cursor.fetchall()]
-            if self.upload_config.table_name not in table_names:
-                raise ValueError(
-                    "Table {} not found in {}".format(
-                        self.upload_config.table_name, ", ".join(table_names)
-                    )
-                )
 
     def get_output_path(self, file_data: FileData, suffix: str = ".json") -> str:
         filename = Path(file_data.source_identifiers.filename)
@@ -98,51 +141,42 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
             self._columns = {desc[0]: desc[1] for desc in cursor.description}
         return self._columns
 
-    def _fit_to_schema(self, df: "DataFrame", add_missing_columns: bool = True) -> "DataFrame":
-        … (11 removed lines are not preserved in this rendering)
-                f"{', '.join(columns_to_drop)}"
-            )
-        if missing_columns and add_missing_columns:
-            logger.info(
-                "Following null filled columns will be added to match the table's schema:"
-                f" {', '.join(missing_columns)} "
+    def can_delete(self) -> bool:
+        existing_columns = self.get_table_columns()
+        return RECORD_ID_LABEL in existing_columns
+
+    def delete_previous_content(self, file_data: FileData) -> None:
+        logger.debug(
+            f"deleting any content with metadata "
+            f"{RECORD_ID_LABEL}={file_data.identifier} "
+            f"from delta table: {self.upload_config.table_name}"
+        )
+        with self.get_cursor() as cursor:
+            cursor.execute(
+                f"DELETE FROM {self.upload_config.table_name} WHERE {RECORD_ID_LABEL} = '{file_data.identifier}'"  # noqa: E501
             )
-        … (3 removed lines are not preserved in this rendering)
-        if add_missing_columns:
-            for column in missing_columns:
-                df[column] = pd.Series()
-        return df
+            results = cursor.fetchall()
+        deleted_rows = results[0][0]
+        logger.debug(f"deleted {deleted_rows} rows from table {self.upload_config.table_name}")
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
-        … (the previous 17-line implementation is not preserved in this rendering)
+        if self.can_delete():
+            self.delete_previous_content(file_data=file_data)
+        with self.get_cursor(staging_allowed_local_path=path.parent.as_posix()) as cursor:
+            catalog_path = self.get_output_path(file_data=file_data)
+            logger.debug(f"uploading {path.as_posix()} to {catalog_path}")
+            cursor.execute(f"PUT '{path.as_posix()}' INTO '{catalog_path}' OVERWRITE")
+            logger.debug(
+                f"migrating content from {catalog_path} to "
+                f"table {self.upload_config.table_name}"
+            )
+            data = get_json_data(path=path)
+            columns = data[0].keys()
+            select_columns = ["PARSE_JSON(metadata)" if c == "metadata" else c for c in columns]
+            column_str = ", ".join(columns)
+            select_column_str = ", ".join(select_columns)
+            sql_statment = f"INSERT INTO `{self.upload_config.table_name}` ({column_str}) SELECT {select_column_str} FROM json.`{catalog_path}`"  # noqa: E501
            cursor.execute(sql_statment)
 
 
 databricks_volumes_delta_tables_destination_entry = DestinationRegistryEntry(
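Taken together, the rewritten connector now auto-creates its destination table from the bundled SQL asset, stamps each staged element with a stable id and its source record id, and uploads via PUT plus INSERT ... SELECT instead of the previous pandas-based path. A hedged sketch of the resulting row shape and the generated SQL; RECORD_ID_LABEL is assumed to resolve to "record_id", and all concrete names, paths, and identifiers below are placeholders:

import json

# 1) One staged element: a flat dict whose metadata is JSON-encoded so it
#    survives the volume round-trip as a plain string column.
element = {"element_id": "e1", "text": "Hello", "metadata": {"filetype": "pdf"}}
staged = {
    **element,
    "id": "enhanced-id-e1",  # get_enhanced_element_id(element_dict=element, file_data=file_data)
    "record_id": "doc-123",  # file_data.identifier, under the assumed RECORD_ID_LABEL
    "metadata": json.dumps(element["metadata"]),
}

# 2) Statements the uploader issues: clear rows from a previous run of the
#    same document, stage the file into the volume, then insert with
#    PARSE_JSON so the metadata string lands as a queryable variant value.
table_name = "elements"  # upload_config.table_name, possibly auto-created
catalog_path = "/Volumes/main/default/my_volume/doc-123.json"  # hypothetical
statements = [
    f"DELETE FROM {table_name} WHERE record_id = 'doc-123'",
    f"PUT '/tmp/doc-123.json' INTO '{catalog_path}' OVERWRITE",
    f"INSERT INTO `{table_name}` ({', '.join(staged)}) "
    f"SELECT {', '.join('PARSE_JSON(metadata)' if c == 'metadata' else c for c in staged)} "
    f"FROM json.`{catalog_path}`",
]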
{unstructured_ingest-0.6.0.dist-info → unstructured_ingest-0.6.2.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: unstructured-ingest
-Version: 0.6.0
+Version: 0.6.2
 Summary: A library that prepares raw documents for downstream ML tasks.
 Home-page: https://github.com/Unstructured-IO/unstructured-ingest
 Author: Unstructured Technologies
@@ -22,12 +22,12 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.9.0,<3.14
 Description-Content-Type: text/markdown
 License-File: LICENSE.md
-Requires-Dist: opentelemetry-sdk
+Requires-Dist: click
+Requires-Dist: python-dateutil
 Requires-Dist: dataclasses_json
 Requires-Dist: pydantic>=2.7
-Requires-Dist: python-dateutil
-Requires-Dist: click
 Requires-Dist: tqdm
+Requires-Dist: opentelemetry-sdk
 Requires-Dist: pandas
 Requires-Dist: numpy
 Provides-Extra: remote
@@ -103,8 +103,8 @@ Requires-Dist: astrapy; extra == "astradb"
 Requires-Dist: pandas; extra == "astradb"
 Requires-Dist: numpy; extra == "astradb"
 Provides-Extra: azure
-Requires-Dist: adlfs; extra == "azure"
 Requires-Dist: fsspec; extra == "azure"
+Requires-Dist: adlfs; extra == "azure"
 Requires-Dist: pandas; extra == "azure"
 Requires-Dist: numpy; extra == "azure"
 Provides-Extra: azure-ai-search
@@ -117,8 +117,8 @@ Requires-Dist: bs4; extra == "biomed"
 Requires-Dist: pandas; extra == "biomed"
 Requires-Dist: numpy; extra == "biomed"
 Provides-Extra: box
-Requires-Dist: boxfs; extra == "box"
 Requires-Dist: fsspec; extra == "box"
+Requires-Dist: boxfs; extra == "box"
 Requires-Dist: pandas; extra == "box"
 Requires-Dist: numpy; extra == "box"
 Provides-Extra: chroma
@@ -130,8 +130,8 @@ Requires-Dist: clarifai; extra == "clarifai"
 Requires-Dist: pandas; extra == "clarifai"
 Requires-Dist: numpy; extra == "clarifai"
 Provides-Extra: confluence
-Requires-Dist: requests; extra == "confluence"
 Requires-Dist: atlassian-python-api; extra == "confluence"
+Requires-Dist: requests; extra == "confluence"
 Requires-Dist: pandas; extra == "confluence"
 Requires-Dist: numpy; extra == "confluence"
 Provides-Extra: couchbase
@@ -161,14 +161,14 @@ Requires-Dist: elasticsearch[async]; extra == "elasticsearch"
 Requires-Dist: pandas; extra == "elasticsearch"
 Requires-Dist: numpy; extra == "elasticsearch"
 Provides-Extra: gcs
-Requires-Dist: gcsfs; extra == "gcs"
 Requires-Dist: fsspec; extra == "gcs"
 Requires-Dist: bs4; extra == "gcs"
+Requires-Dist: gcsfs; extra == "gcs"
 Requires-Dist: pandas; extra == "gcs"
 Requires-Dist: numpy; extra == "gcs"
 Provides-Extra: github
-Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: requests; extra == "github"
+Requires-Dist: pygithub>1.58.0; extra == "github"
 Requires-Dist: pandas; extra == "github"
 Requires-Dist: numpy; extra == "github"
 Provides-Extra: gitlab
@@ -185,10 +185,10 @@ Requires-Dist: urllib3; extra == "hubspot"
 Requires-Dist: pandas; extra == "hubspot"
 Requires-Dist: numpy; extra == "hubspot"
 Provides-Extra: ibm-watsonx-s3
-Requires-Dist: httpx; extra == "ibm-watsonx-s3"
+Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
 Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
 Requires-Dist: pyiceberg; extra == "ibm-watsonx-s3"
-Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
+Requires-Dist: httpx; extra == "ibm-watsonx-s3"
 Requires-Dist: pandas; extra == "ibm-watsonx-s3"
 Requires-Dist: numpy; extra == "ibm-watsonx-s3"
 Provides-Extra: jira
@@ -216,21 +216,21 @@ Requires-Dist: pymongo; extra == "mongodb"
 Requires-Dist: pandas; extra == "mongodb"
 Requires-Dist: numpy; extra == "mongodb"
 Provides-Extra: neo4j
-Requires-Dist: networkx; extra == "neo4j"
-Requires-Dist: cymple; extra == "neo4j"
 Requires-Dist: neo4j-rust-ext; extra == "neo4j"
+Requires-Dist: cymple; extra == "neo4j"
+Requires-Dist: networkx; extra == "neo4j"
 Requires-Dist: pandas; extra == "neo4j"
 Requires-Dist: numpy; extra == "neo4j"
 Provides-Extra: notion
 Requires-Dist: backoff; extra == "notion"
-Requires-Dist: httpx; extra == "notion"
 Requires-Dist: notion-client; extra == "notion"
+Requires-Dist: httpx; extra == "notion"
 Requires-Dist: htmlBuilder; extra == "notion"
 Requires-Dist: pandas; extra == "notion"
 Requires-Dist: numpy; extra == "notion"
 Provides-Extra: onedrive
-Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
+Requires-Dist: msal; extra == "onedrive"
 Requires-Dist: bs4; extra == "onedrive"
 Requires-Dist: pandas; extra == "onedrive"
 Requires-Dist: numpy; extra == "onedrive"
@@ -239,8 +239,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
 Requires-Dist: pandas; extra == "opensearch"
 Requires-Dist: numpy; extra == "opensearch"
 Provides-Extra: outlook
-Requires-Dist: msal; extra == "outlook"
 Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
+Requires-Dist: msal; extra == "outlook"
 Requires-Dist: pandas; extra == "outlook"
 Requires-Dist: numpy; extra == "outlook"
 Provides-Extra: pinecone
@@ -269,8 +269,8 @@ Requires-Dist: s3fs; extra == "s3"
 Requires-Dist: pandas; extra == "s3"
 Requires-Dist: numpy; extra == "s3"
 Provides-Extra: sharepoint
-Requires-Dist: msal; extra == "sharepoint"
 Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
+Requires-Dist: msal; extra == "sharepoint"
 Requires-Dist: pandas; extra == "sharepoint"
 Requires-Dist: numpy; extra == "sharepoint"
 Provides-Extra: salesforce
@@ -278,8 +278,8 @@ Requires-Dist: simple-salesforce; extra == "salesforce"
 Requires-Dist: pandas; extra == "salesforce"
 Requires-Dist: numpy; extra == "salesforce"
 Provides-Extra: sftp
-Requires-Dist: fsspec; extra == "sftp"
 Requires-Dist: paramiko; extra == "sftp"
+Requires-Dist: fsspec; extra == "sftp"
 Requires-Dist: pandas; extra == "sftp"
 Requires-Dist: numpy; extra == "sftp"
 Provides-Extra: slack
@@ -287,8 +287,8 @@ Requires-Dist: slack_sdk[optional]; extra == "slack"
 Requires-Dist: pandas; extra == "slack"
 Requires-Dist: numpy; extra == "slack"
 Provides-Extra: snowflake
-Requires-Dist: psycopg2-binary; extra == "snowflake"
 Requires-Dist: snowflake-connector-python; extra == "snowflake"
+Requires-Dist: psycopg2-binary; extra == "snowflake"
 Requires-Dist: pandas; extra == "snowflake"
 Requires-Dist: numpy; extra == "snowflake"
 Provides-Extra: wikipedia
@@ -312,21 +312,21 @@ Requires-Dist: singlestoredb; extra == "singlestore"
 Requires-Dist: pandas; extra == "singlestore"
 Requires-Dist: numpy; extra == "singlestore"
 Provides-Extra: vectara
-Requires-Dist: httpx; extra == "vectara"
 Requires-Dist: requests; extra == "vectara"
+Requires-Dist: httpx; extra == "vectara"
 Requires-Dist: aiofiles; extra == "vectara"
 Requires-Dist: pandas; extra == "vectara"
 Requires-Dist: numpy; extra == "vectara"
 Provides-Extra: vastdb
-Requires-Dist: ibis; extra == "vastdb"
 Requires-Dist: pyarrow; extra == "vastdb"
+Requires-Dist: ibis; extra == "vastdb"
 Requires-Dist: vastdb; extra == "vastdb"
 Requires-Dist: pandas; extra == "vastdb"
 Requires-Dist: numpy; extra == "vastdb"
 Provides-Extra: zendesk
 Requires-Dist: httpx; extra == "zendesk"
-Requires-Dist: aiofiles; extra == "zendesk"
 Requires-Dist: bs4; extra == "zendesk"
+Requires-Dist: aiofiles; extra == "zendesk"
 Requires-Dist: pandas; extra == "zendesk"
 Requires-Dist: numpy; extra == "zendesk"
 Provides-Extra: embed-huggingface
@@ -356,8 +356,8 @@ Requires-Dist: tiktoken; extra == "openai"
 Requires-Dist: pandas; extra == "openai"
 Requires-Dist: numpy; extra == "openai"
 Provides-Extra: bedrock
-Requires-Dist: aioboto3; extra == "bedrock"
 Requires-Dist: boto3; extra == "bedrock"
+Requires-Dist: aioboto3; extra == "bedrock"
 Requires-Dist: pandas; extra == "bedrock"
 Requires-Dist: numpy; extra == "bedrock"
 Provides-Extra: togetherai
{unstructured_ingest-0.6.0.dist-info → unstructured_ingest-0.6.2.dist-info}/RECORD
RENAMED
@@ -91,8 +91,6 @@ test/unit/v2/chunkers/test_chunkers.py,sha256=HSr3_lsoMw1nkDhkjO0-NOTEomRdR9oxCr
 test/unit/v2/connectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/connectors/test_confluence.py,sha256=lN6nnU5qOtmsjIGcz65roepm76w4vPF7AmSzi9vqV78,1919
 test/unit/v2/connectors/test_jira.py,sha256=XEBBDSdNZWUVO5JbpiSsjazJYmbLsgXUOW-APqPRKLg,12113
-test/unit/v2/connectors/databricks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-test/unit/v2/connectors/databricks/test_volumes_table.py,sha256=-R_EJHqv1BseGRK9VRAZhF-2EXA64LAlhycoyIu556U,1078
 test/unit/v2/connectors/ibm_watsonx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py,sha256=WKpDKvEGalh8LYRqN9xA7CfMPOPHo_VcZbnCXdkVjho,14513
 test/unit/v2/connectors/motherduck/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -113,7 +111,7 @@ test/unit/v2/partitioners/test_partitioner.py,sha256=iIYg7IpftV3LusoO4H8tr1IHY1U
 test/unit/v2/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 test/unit/v2/utils/data_generator.py,sha256=UoYVNjG4S4wlaA9gceQ82HIpF9_6I1UTHD1_GrQBHp0,973
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=
+unstructured_ingest/__version__.py,sha256=UDy7drjkPUljex5sEiDR3ZALQNnlcrCXwJShdKZ37Ek,42
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/interfaces.py,sha256=7DOnDpGvUNlCoFR7UPRGmOarqH5sFtuUOO5vf8X3oTM,31489
 unstructured_ingest/logger.py,sha256=S5nSqGcABoQyeicgRnBQFjDScCaTvFVivOCvbo-laL0,4479
@@ -421,7 +419,7 @@ unstructured_ingest/v2/pipeline/steps/stage.py,sha256=_0BN2i273y_fZyvSUPOOeXv4kL
 unstructured_ingest/v2/pipeline/steps/uncompress.py,sha256=I9TyqMCUSxlf2kdPADjeH4TrUTSe0FMTlARp9QD6TsE,1763
 unstructured_ingest/v2/pipeline/steps/upload.py,sha256=6x8SUdnydR76K6cR3nUVupOACIx-XsRV3vXRlebolqg,1996
 unstructured_ingest/v2/processes/__init__.py,sha256=FaHWSCGyc7GWVnAsNEUUj7L8hT8gCVY3_hUE2VzWtUg,462
-unstructured_ingest/v2/processes/chunker.py,sha256=
+unstructured_ingest/v2/processes/chunker.py,sha256=O5FN8KWym79H0dtKZvW7ABgn4bwKtaeUO8meGdjM2Yo,5609
 unstructured_ingest/v2/processes/connector_registry.py,sha256=vkEe6jpgdYtZCxMj59s5atWGgmPuxAEXRUoTt-MJ7wc,2198
 unstructured_ingest/v2/processes/embedder.py,sha256=gvlCQDsbQVgcp-2f0Qq4RiFbcr8gJwIS-imgloE-UOc,7887
 unstructured_ingest/v2/processes/filter.py,sha256=E1MLxk-XeCm3mZIuM49lJToVcSgOivmTFIZApqOEFs8,2150
@@ -454,14 +452,15 @@ unstructured_ingest/v2/processes/connectors/slack.py,sha256=vbBVCYEd741-n2v6eAXL
 unstructured_ingest/v2/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
 unstructured_ingest/v2/processes/connectors/vectara.py,sha256=KUqgZ6D2KUOrW596ms-EekvQYDh-fXqBTa7KG-leXoo,12301
 unstructured_ingest/v2/processes/connectors/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+unstructured_ingest/v2/processes/connectors/assets/databricks_delta_table_schema.sql,sha256=dUZZDNkyvQXKqoAThRz3ek7zaUE2l_LAQimlG5WZhH4,211
 unstructured_ingest/v2/processes/connectors/assets/weaviate_collection_config.json,sha256=SJlIO0kXxy866tWQ8bEzvwLwflsoUMIS-OKlxMvHIuE,504
 unstructured_ingest/v2/processes/connectors/databricks/__init__.py,sha256=Oh8SwTWi66gO8BsNF6vRMoQVuegyBPPCpVozkOHEf3A,2136
-unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=
+unstructured_ingest/v2/processes/connectors/databricks/volumes.py,sha256=EghKdkt4nGacGxulSpjhToHOl5BRLbb3xNZpJzpWNX8,8002
 unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py,sha256=h6qDxQhWlT7H4K1CEfKag1stTiD1o97VckJZERsofqU,2970
 unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py,sha256=gjICJJwhDHBLt_L-LrMlvJ3DL1DYtwFpyMLb_zYvOIg,3755
 unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py,sha256=Uss3XPPaq1AsqJOEy4RJgBJw2-bTjrXH2PgtVNYd2w0,3006
 unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py,sha256=g1qYnIrML4TjN7rmC0MGrD5JzAprb6SymBHlEdOumz0,3113
-unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=
+unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py,sha256=LiSb66039idaRtMnTuHjR5ZqvdmmIu3ByUgFQ1a3iZQ,8264
 unstructured_ingest/v2/processes/connectors/duckdb/__init__.py,sha256=5sVvJCWhU-YkjHIwk4W6BZCanFYK5W4xTpWtQ8xzeB4,561
 unstructured_ingest/v2/processes/connectors/duckdb/base.py,sha256=VCoQ3h289BO4A2kJKZXUVB0QOcaQif-HeRgg-xXzn10,2976
 unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py,sha256=DM4pygQAnP-dtuFEFAVeBfGt0pzrfkltteCai0GKnG0,4439
@@ -582,9 +581,9 @@ unstructured_ingest/v2/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JC
 unstructured_ingest/v2/processes/utils/blob_storage.py,sha256=_I3OMdpUElQdIwVs7W9ORU1kncNaZ_nr6lbxeKE8uaU,1014
 unstructured_ingest/v2/types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/v2/types/file_data.py,sha256=kowOhvYy0q_-khX3IuR111AfjkdQezEfxjzK6QDH7oA,3836
-unstructured_ingest-0.6.0.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
-unstructured_ingest-0.6.0.dist-info/METADATA,sha256=
-unstructured_ingest-0.6.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-unstructured_ingest-0.6.0.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
-unstructured_ingest-0.6.0.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
-unstructured_ingest-0.6.0.dist-info/RECORD,,
+unstructured_ingest-0.6.2.dist-info/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-0.6.2.dist-info/METADATA,sha256=yUMpJD0UXDhUG1cIIpHkjn-VU2AScEaA12wLmISmG-A,14998
+unstructured_ingest-0.6.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+unstructured_ingest-0.6.2.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-0.6.2.dist-info/top_level.txt,sha256=DMuDMHZRMdeay8v8Kdi855muIv92F0OkutvBCaBEW6M,25
+unstructured_ingest-0.6.2.dist-info/RECORD,,
test/unit/v2/connectors/databricks/__init__.py
DELETED
File without changes
test/unit/v2/connectors/databricks/test_volumes_table.py
DELETED
@@ -1,44 +0,0 @@
-from pathlib import Path
-
-import pytest
-from pytest_mock import MockerFixture
-
-from unstructured_ingest.v2.processes.connectors.databricks.volumes_table import (
-    DatabricksVolumeDeltaTableStager,
-)
-
-
-@pytest.fixture
-def stager():
-    return DatabricksVolumeDeltaTableStager()
-
-
-@pytest.mark.parametrize(
-    ("output_path", "called_output_path"),
-    [
-        (
-            Path("/fake/path/output"),
-            Path("/fake/path/output.json"),
-        ),
-        (
-            Path("/fake/path/output.ndjson"),
-            Path("/fake/path/output.json"),
-        ),
-    ],
-)
-def test_write_output(
-    mocker: MockerFixture,
-    stager: DatabricksVolumeDeltaTableStager,
-    output_path: Path,
-    called_output_path: Path,
-):
-    data = [{"key1": "value1", "key2": "value2"}]
-
-    mock_get_data = mocker.patch(
-        "unstructured_ingest.v2.processes.connectors.databricks.volumes_table.write_data",
-        return_value=None,
-    )
-
-    stager.write_output(output_path, data)
-
-    mock_get_data.assert_called_once_with(path=called_output_path, data=data, indent=None)
{unstructured_ingest-0.6.0.dist-info → unstructured_ingest-0.6.2.dist-info}/LICENSE.md
RENAMED
File without changes
{unstructured_ingest-0.6.0.dist-info → unstructured_ingest-0.6.2.dist-info}/WHEEL
RENAMED
File without changes
{unstructured_ingest-0.6.0.dist-info → unstructured_ingest-0.6.2.dist-info}/entry_points.txt
RENAMED
File without changes
{unstructured_ingest-0.6.0.dist-info → unstructured_ingest-0.6.2.dist-info}/top_level.txt
RENAMED
File without changes