unstructured-ingest 1.0.40-py3-none-any.whl → 1.0.44-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/embed/togetherai.py +1 -1
- unstructured_ingest/processes/connectors/airtable.py +1 -0
- unstructured_ingest/processes/connectors/astradb.py +7 -2
- unstructured_ingest/processes/connectors/confluence.py +6 -2
- unstructured_ingest/processes/connectors/databricks/volumes.py +7 -5
- unstructured_ingest/processes/connectors/delta_table.py +84 -30
- unstructured_ingest/processes/connectors/discord.py +1 -0
- unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py +9 -2
- unstructured_ingest/processes/connectors/gitlab.py +7 -6
- unstructured_ingest/processes/connectors/jira.py +1 -0
- unstructured_ingest/processes/connectors/local.py +11 -11
- unstructured_ingest/processes/connectors/mongodb.py +5 -0
- unstructured_ingest/processes/connectors/notion/connector.py +2 -0
- unstructured_ingest/processes/connectors/onedrive.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +3 -2
- unstructured_ingest/processes/connectors/salesforce.py +6 -4
- unstructured_ingest/processes/connectors/slack.py +5 -3
- unstructured_ingest/processes/connectors/sql/sql.py +8 -1
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +10 -6
- {unstructured_ingest-1.0.40.dist-info → unstructured_ingest-1.0.44.dist-info}/METADATA +2 -1
- {unstructured_ingest-1.0.40.dist-info → unstructured_ingest-1.0.44.dist-info}/RECORD +25 -25
- {unstructured_ingest-1.0.40.dist-info → unstructured_ingest-1.0.44.dist-info}/WHEEL +0 -0
- {unstructured_ingest-1.0.40.dist-info → unstructured_ingest-1.0.44.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-1.0.40.dist-info → unstructured_ingest-1.0.44.dist-info}/licenses/LICENSE.md +0 -0
unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "1.0.40"  # pragma: no cover
+__version__ = "1.0.44"  # pragma: no cover
unstructured_ingest/embed/togetherai.py
@@ -22,7 +22,7 @@ if TYPE_CHECKING:
 class TogetherAIEmbeddingConfig(EmbeddingConfig):
     api_key: SecretStr = Field(description="API key for Together AI")
     embedder_model_name: str = Field(
-        default="togethercomputer/m2-bert-80M-
+        default="togethercomputer/m2-bert-80M-32k-retrieval",
         alias="model_name",
         description="Together AI model name",
     )
unstructured_ingest/processes/connectors/astradb.py
@@ -195,8 +195,12 @@ class AstraDBIndexer(Indexer):
         all_ids = self._get_doc_ids()
         ids = list(all_ids)
         id_batches = batch_generator(ids, self.index_config.batch_size)
-
         for batch in id_batches:
+            batch_items = [BatchItem(identifier=b) for b in batch]
+            display_name = (
+                f"{self.index_config.collection_name}-{self.index_config.keyspace}"
+                f"-[{batch_items[0].identifier}..{batch_items[-1].identifier}]"
+            )
             fd = AstraDBBatchFileData(
                 connector_type=CONNECTOR_TYPE,
                 metadata=FileDataSourceMetadata(
@@ -206,7 +210,8 @@ class AstraDBIndexer(Indexer):
                 collection_name=self.index_config.collection_name,
                 keyspace=self.index_config.keyspace,
             ),
-            batch_items=[BatchItem(identifier=b) for b in batch],
+            batch_items=batch_items,
+            display_name=display_name,
         )
         yield fd

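The astradb change above (and the matching elasticsearch and SQL changes below) builds a human-readable display_name for each batch before constructing the batch file data. A minimal sketch of the idea, assuming a simple chunking helper; the package's real batch_generator and BatchItem live in its internals and may differ, and the collection/keyspace names here are made up:

from itertools import islice
from typing import Iterable, Iterator, TypeVar

T = TypeVar("T")


def batch_generator(iterable: Iterable[T], batch_size: int) -> Iterator[list[T]]:
    # Yield successive chunks of at most batch_size items from any iterable.
    it = iter(iterable)
    while batch := list(islice(it, batch_size)):
        yield batch


ids = [f"doc-{i:03d}" for i in range(95)]
for batch in batch_generator(ids, 40):
    # Label each batch by its first and last identifier, as the connectors now do.
    display_name = f"my_collection-my_keyspace-[{batch[0]}..{batch[-1]}]"
    print(display_name, len(batch))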
unstructured_ingest/processes/connectors/confluence.py
@@ -186,12 +186,15 @@ class ConfluenceIndexer(Indexer):
         pages = client.get_all_pages_from_space(
             space=space_key,
             start=0,
-            limit=self.index_config.max_num_of_docs_from_each_space,
             expand=None,
             content_type="page",  # blogpost and comment types not currently supported
             status=None,
         )
-        doc_ids = [{"space_id": space_key, "doc_id": page["id"]} for page in pages]
+        # Limit the number of documents to max_num_of_docs_from_each_space
+        # Note: this is needed because the limit field in client.get_all_pages_from_space does
+        # not seem to work as expected
+        limited_pages = pages[: self.index_config.max_num_of_docs_from_each_space]
+        doc_ids = [{"space_id": space_key, "doc_id": page["id"]} for page in limited_pages]
         return doc_ids

     def run(self) -> Generator[FileData, None, None]:
@@ -233,6 +236,7 @@ class ConfluenceIndexer(Indexer):
             metadata=metadata,
             additional_metadata=additional_metadata,
             source_identifiers=source_identifiers,
+            display_name=source_identifiers.fullpath,
         )
         yield file_data

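The comment in the hunk explains the workaround: because the limit argument of get_all_pages_from_space does not behave as expected, the cap is enforced client-side by slicing. A hedged, self-contained illustration of the same defensive pattern; fetch_pages and the page shape are stand-ins, not the atlassian client API:

def fetch_pages(space_key: str) -> list[dict]:
    # Stand-in for client.get_all_pages_from_space; may return more pages than asked for.
    return [{"id": f"{space_key}-{i}"} for i in range(250)]


def get_doc_ids(space_key: str, max_docs: int) -> list[dict]:
    pages = fetch_pages(space_key)
    # Enforce the cap locally rather than trusting the server-side limit parameter.
    limited_pages = pages[:max_docs]
    return [{"space_id": space_key, "doc_id": page["id"]} for page in limited_pages]


assert len(get_doc_ids("ENG", 100)) == 100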
unstructured_ingest/processes/connectors/databricks/volumes.py
@@ -133,14 +133,15 @@ class DatabricksVolumesIndexer(Indexer, ABC):
                 if rel_path.startswith("/"):
                     rel_path = rel_path[1:]
                 filename = Path(file_info.path).name
+                source_identifiers = SourceIdentifiers(
+                    filename=filename,
+                    rel_path=rel_path,
+                    fullpath=file_info.path,
+                )
                 yield FileData(
                     identifier=str(uuid5(NAMESPACE_DNS, file_info.path)),
                     connector_type=self.connector_type,
-                    source_identifiers=SourceIdentifiers(
-                        filename=filename,
-                        rel_path=rel_path,
-                        fullpath=file_info.path,
-                    ),
+                    source_identifiers=source_identifiers,
                     additional_metadata={
                         "catalog": self.index_config.catalog,
                         "path": file_info.path,
@@ -148,6 +149,7 @@ class DatabricksVolumesIndexer(Indexer, ABC):
                     metadata=FileDataSourceMetadata(
                         url=file_info.path, date_modified=str(file_info.modification_time)
                     ),
+                    display_name=source_identifiers.fullpath,
                 )
         except Exception as e:
             raise self.connection_config.wrap_error(e=e)
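This hunk establishes the refactor repeated in the gitlab, local, outlook, salesforce, slack, notion, and zendesk diffs below: build SourceIdentifiers once in a local variable so its fullpath can double as the new display_name field. A simplified sketch using dataclass stand-ins for the package's models (the real classes are richer pydantic models):

from dataclasses import dataclass
from pathlib import Path
from typing import Optional


@dataclass
class SourceIdentifiers:
    filename: str
    fullpath: str
    rel_path: Optional[str] = None


@dataclass
class FileData:
    identifier: str
    source_identifiers: SourceIdentifiers
    display_name: Optional[str] = None


def to_file_data(path: str, rel_path: str) -> FileData:
    # Build the identifiers once, then reuse them for both fields, instead of
    # constructing SourceIdentifiers inline inside the FileData(...) call.
    source_identifiers = SourceIdentifiers(
        filename=Path(path).name,
        fullpath=path,
        rel_path=rel_path,
    )
    return FileData(
        identifier=path,
        source_identifiers=source_identifiers,
        display_name=source_identifiers.fullpath,
    )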
unstructured_ingest/processes/connectors/delta_table.py
@@ -1,7 +1,7 @@
-import os
+import logging
 import traceback
 from dataclasses import dataclass, field
-from multiprocessing import Process, Queue
+from multiprocessing import Process, Queue, current_process
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional
 from urllib.parse import urlparse
@@ -20,6 +20,7 @@ from unstructured_ingest.interfaces import (
 )
 from unstructured_ingest.logger import logger
 from unstructured_ingest.processes.connector_registry import DestinationRegistryEntry
+from unstructured_ingest.utils.constants import RECORD_ID_LABEL
 from unstructured_ingest.utils.data_prep import get_data_df, get_json_data
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.utils.table import convert_to_pandas_dataframe
@@ -47,18 +48,17 @@ class DeltaTableAccessConfig(AccessConfig):

 class DeltaTableConnectionConfig(ConnectionConfig):
     access_config: Secret[DeltaTableAccessConfig] = Field(
-        default=DeltaTableAccessConfig(), validate_default=True
+        default=Secret(DeltaTableAccessConfig()), validate_default=True
     )
     aws_region: Optional[str] = Field(default=None, description="AWS Region")
     table_uri: str = Field(
-        default=None,
         description=(
             "Local path or path to the target folder in the S3 bucket, "
             "formatted as s3://my-bucket/my-folder/"
         ),
     )

-    def update_storage_options(self, storage_options: dict) -> None:
+    def update_storage_options(self, storage_options: dict[str, str]) -> None:
         secrets = self.access_config.get_secret_value()
         if self.aws_region and secrets.aws_access_key_id and secrets.aws_secret_access_key:
             storage_options["AWS_REGION"] = self.aws_region
@@ -80,9 +80,10 @@ class DeltaTableUploadStager(UploadStager):
         default_factory=lambda: DeltaTableUploadStagerConfig()
     )

-    def run(
+    def run(  # type: ignore[override]
         self,
         elements_filepath: Path,
+        file_data: FileData,
         output_dir: Path,
         output_filename: str,
         **kwargs: Any,
@@ -91,6 +92,8 @@ class DeltaTableUploadStager(UploadStager):
         output_path = Path(output_dir) / Path(f"{output_filename}.parquet")

         df = convert_to_pandas_dataframe(elements_dict=elements_contents)
+        # Ensure per-record overwrite/delete semantics: tag each row with the record identifier
+        df[RECORD_ID_LABEL] = file_data.identifier
         df = df.dropna(axis=1, how="all")
         df.to_parquet(output_path)

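The stager now stamps every row with the originating record's identifier, so the uploader can later delete exactly those rows before appending fresh ones. A minimal sketch of the same idea with pandas; the real RECORD_ID_LABEL value lives in unstructured_ingest.utils.constants, and "record_id" here is an assumption:

import pandas as pd

RECORD_ID_LABEL = "record_id"  # assumed value; the real constant is imported above


def stage_elements(elements: list[dict], record_identifier: str) -> pd.DataFrame:
    df = pd.DataFrame(elements)
    # Tag each row so a later delete(predicate=...) can target this record alone,
    # making repeated uploads of the same record idempotent.
    df[RECORD_ID_LABEL] = record_identifier
    return df.dropna(axis=1, how="all")


df = stage_elements([{"text": "hello"}, {"text": "world"}], "doc-123")
print(df)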
@@ -138,41 +141,92 @@ class DeltaTableUploader(Uploader):
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")

+    @requires_dependencies(["tenacity"], extras="delta-table")
     def upload_dataframe(self, df: "DataFrame", file_data: FileData) -> None:
-        updated_upload_path = os.path.join(
-            self.connection_config.table_uri, file_data.source_identifiers.relative_path
-        )
+        upload_path = self.connection_config.table_uri
         logger.info(
-            f"writing {len(df)} rows to destination table "
-            f"at {updated_upload_path}\ndtypes: {df.dtypes}",
+            f"writing {len(df)} rows to destination table at {upload_path}\ndtypes: {df.dtypes}",
         )
-        storage_options = {}
+        storage_options: dict[str, str] = {}
         self.connection_config.update_storage_options(storage_options=storage_options)

+        # Decide whether the Delta table already exists. If it does, we first delete all rows
+        # belonging to the current record and then append the fresh data. Otherwise we will
+        # create a brand-new table via an overwrite.
+
+        mode = "overwrite"
+        try:
+            from deltalake import DeltaTable  # pylint: disable=import-error
+
+            dt = DeltaTable(upload_path, storage_options=storage_options)
+            logger.debug(f"Table exists: deleting rows for {file_data.identifier}")
+            # Table exists – remove any previous rows for this record_id so that appending is
+            # effectively an idempotent overwrite for the record.
+            dt.delete(predicate=f"{RECORD_ID_LABEL} = '{file_data.identifier}'")
+            mode = "append"
+        except Exception:
+            # Table does not exist yet (or cannot be opened) – we will create it below with
+            # mode="overwrite". All other failures will be captured later by the writer.
+            logger.debug("Table does not exist: creating new table")
+
         writer_kwargs = {
-            "table_or_uri": updated_upload_path,
+            "table_or_uri": upload_path,
             "data": df,
-            "mode": "overwrite",
+            "mode": mode,
             "schema_mode": "merge",
             "storage_options": storage_options,
         }
-        # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and
-        # cause ingest to fail, even though all tasks are completed normally. Putting the writer
-        # into a process mitigates this issue by ensuring python interpreter waits properly for
-        # deltalake's rust backend to finish
-        queue: Queue = Queue()
-        writer = Process(
-            target=write_deltalake_with_error_handling,
-            kwargs={"queue": queue, **writer_kwargs},
+
+        from tenacity import (
+            before_log,
+            retry,
+            retry_if_exception,
+            stop_after_attempt,
+            wait_random,
         )
-        writer.start()
-        writer.join()

-        # Check if the queue has any exception message
-        if not queue.empty():
-            error_message = queue.get()
-            logger.error("Exception occurred in write_deltalake: %s", error_message)
-            raise RuntimeError(f"Error in write_deltalake: {error_message}")
+        def _is_commit_conflict(exc: BaseException) -> bool:  # noqa: ANN401
+            """Return True if exception looks like a Delta Lake commit conflict."""
+
+            return isinstance(exc, RuntimeError) and (
+                "CommitFailed" in str(exc) or "Metadata changed" in str(exc)
+            )
+
+        @retry(
+            stop=stop_after_attempt(10),
+            wait=wait_random(min=0.2, max=1.0),
+            before=before_log(logger, logging.DEBUG),
+            retry=retry_if_exception(_is_commit_conflict),
+            reraise=True,
+        )
+        def _single_attempt() -> None:
+            """One optimistic transaction: delete old rows, then append new ones."""
+
+            # NOTE: deltalake writer on Linux sometimes can finish but still trigger a SIGABRT and
+            # cause ingest to fail, even though all tasks are completed normally. Putting the writer
+            # into a process mitigates this issue by ensuring python interpreter waits properly for
+            # deltalake's rust backend to finish
+            queue: Queue[str] = Queue()
+
+            if current_process().daemon:
+                # write_deltalake_with_error_handling will push any traceback to our queue
+                write_deltalake_with_error_handling(queue=queue, **writer_kwargs)
+            else:
+                # On non-daemon processes we still guard against SIGABRT by running in a subprocess.
+                writer = Process(
+                    target=write_deltalake_with_error_handling,
+                    kwargs={"queue": queue, **writer_kwargs},
+                )
+                writer.start()
+                writer.join()
+
+            # Check if the queue has any exception message
+            if not queue.empty():
+                error_message = queue.get()
+                logger.error("Exception occurred in write_deltalake: %s", error_message)
+                raise RuntimeError(f"Error in write_deltalake: {error_message}")
+
+        _single_attempt()

     @requires_dependencies(["pandas"], extras="delta-table")
     def run_data(self, data: list[dict], file_data: FileData, **kwargs: Any) -> None:
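The retry wiring in the hunk above is why tenacity joins the delta-table extra (see the METADATA diff below): concurrent uploaders writing to one Delta table can lose the optimistic-concurrency race, and the losing commit surfaces as a RuntimeError mentioning CommitFailed or Metadata changed. A standalone sketch of that retry decorator, using the same tenacity calls; commit_once is a stand-in for one delete-then-append transaction:

import logging

from tenacity import before_log, retry, retry_if_exception, stop_after_attempt, wait_random

logger = logging.getLogger(__name__)


def is_commit_conflict(exc: BaseException) -> bool:
    # String matching mirrors the hunk above; deltalake reports conflicts this way.
    return isinstance(exc, RuntimeError) and (
        "CommitFailed" in str(exc) or "Metadata changed" in str(exc)
    )


@retry(
    stop=stop_after_attempt(10),  # up to 10 optimistic attempts
    wait=wait_random(min=0.2, max=1.0),  # random jitter de-correlates competing writers
    before=before_log(logger, logging.DEBUG),
    retry=retry_if_exception(is_commit_conflict),  # only commit conflicts are retried
    reraise=True,  # surface the original error once attempts are exhausted
)
def commit_once() -> None:
    # Stand-in for one delete-then-append transaction against the Delta table.
    ...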
@@ -182,7 +236,7 @@ class DeltaTableUploader(Uploader):
         self.upload_dataframe(df=df, file_data=file_data)

     @requires_dependencies(["pandas"], extras="delta-table")
-    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
+    def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:  # type: ignore[override]
         df = get_data_df(path)
         self.upload_dataframe(df=df, file_data=file_data)

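The write itself still runs in a child process to dodge the SIGABRT noted in the diff, with a Queue carrying any traceback back to the parent; when the uploader already sits in a daemon process (which may not spawn children), it falls back to calling the wrapper in-process. A self-contained sketch of that pattern, where do_write stands in for the deltalake writer:

import traceback
from multiprocessing import Process, Queue, current_process


def do_write(**kwargs) -> None:
    # Stand-in for deltalake's writer; assume it may abort the interpreter on some platforms.
    print(f"writing with {kwargs}")


def write_with_error_handling(queue: Queue, **kwargs) -> None:
    # Push the traceback onto the queue instead of letting it die with the subprocess.
    try:
        do_write(**kwargs)
    except Exception:
        queue.put(traceback.format_exc())


def guarded_write(**kwargs) -> None:
    queue: Queue = Queue()
    if current_process().daemon:
        # Daemonic processes are not allowed to have children; run in-process instead.
        write_with_error_handling(queue, **kwargs)
    else:
        writer = Process(target=write_with_error_handling, kwargs={"queue": queue, **kwargs})
        writer.start()
        writer.join()
    if not queue.empty():
        raise RuntimeError(f"Error in writer subprocess: {queue.get()}")


if __name__ == "__main__":
    guarded_write(mode="append")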
unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py
@@ -199,17 +199,24 @@ class ElasticsearchIndexer(Indexer):
         all_ids = self._get_doc_ids()
         ids = list(all_ids)
         for batch in batch_generator(ids, self.index_config.batch_size):
+            batch_items = [BatchItem(identifier=b) for b in batch]
+            url = f"{self.connection_config.hosts[0]}/{self.index_config.index_name}"
+            display_name = (
+                f"url={url}, batch_size={len(batch_items)} "
+                f"ids={batch_items[0].identifier}..{batch_items[-1].identifier}"
+            )  # noqa: E501
             # Make sure the hash is always a positive number to create identified
             yield ElasticsearchBatchFileData(
                 connector_type=CONNECTOR_TYPE,
                 metadata=FileDataSourceMetadata(
-                    url=f"{self.connection_config.hosts[0]}/{self.index_config.index_name}",
+                    url=url,
                     date_processed=str(time()),
                 ),
                 additional_metadata=ElastisearchAdditionalMetadata(
                     index_name=self.index_config.index_name,
                 ),
-                batch_items=[BatchItem(identifier=b) for b in batch],
+                batch_items=batch_items,
+                display_name=display_name,
             )

unstructured_ingest/processes/connectors/gitlab.py
@@ -190,21 +190,22 @@ class GitLabIndexer(Indexer):
                 "file_path": file["path"],
                 "ref": ref,
             }
-
+            source_identifiers = SourceIdentifiers(
+                fullpath=file["path"],
+                filename=Path(file["path"]).name,
+                rel_path=relative_path,
+            )
             yield FileData(
                 identifier=file["id"],
                 connector_type=CONNECTOR_TYPE,
-                source_identifiers=SourceIdentifiers(
-                    fullpath=file["path"],
-                    filename=Path(file["path"]).name,
-                    rel_path=relative_path,
-                ),
+                source_identifiers=source_identifiers,
                 metadata=FileDataSourceMetadata(
                     url=file["id"],
                     record_locator=record_locator,
                     permissions_data=[{"mode": file["mode"]}],
                 ),
                 additional_metadata={},
+                display_name=source_identifiers.fullpath,
             )

unstructured_ingest/processes/connectors/local.py
@@ -119,21 +119,21 @@ class LocalIndexer(Indexer):

     def run(self, **kwargs: Any) -> Generator[FileData, None, None]:
         for file_path in self.list_files():
+            source_identifiers = SourceIdentifiers(
+                fullpath=str(file_path.resolve()),
+                filename=file_path.name,
+                rel_path=(
+                    str(file_path.resolve()).replace(str(self.index_config.path.resolve()), "")[1:]
+                    if not self.index_config.path.is_file()
+                    else self.index_config.path.name
+                ),
+            )
             file_data = FileData(
                 identifier=str(file_path.resolve()),
                 connector_type=CONNECTOR_TYPE,
-                source_identifiers=SourceIdentifiers(
-                    fullpath=str(file_path.resolve()),
-                    filename=file_path.name,
-                    rel_path=(
-                        str(file_path.resolve()).replace(str(self.index_config.path.resolve()), "")[
-                            1:
-                        ]
-                        if not self.index_config.path.is_file()
-                        else self.index_config.path.name
-                    ),
-                ),
+                source_identifiers=source_identifiers,
                 metadata=self.get_file_metadata(path=file_path),
+                display_name=source_identifiers.fullpath,
             )
             yield file_data

unstructured_ingest/processes/connectors/mongodb.py
@@ -149,6 +149,10 @@ class MongoDBIndexer(Indexer):

         for id_batch in batch_generator(ids, batch_size=batch_size):
             # Make sure the hash is always a positive number to create identifier
+            display_name = (
+                f"{self.index_config.database}.{self.index_config.collection}, "
+                f"batch {id_batch[0]}-{id_batch[-1]}"
+            )
             metadata = FileDataSourceMetadata(
                 date_processed=str(time()),
                 record_locator={
@@ -164,6 +168,7 @@ class MongoDBIndexer(Indexer):
                 additional_metadata=MongoDBAdditionalMetadata(
                     collection=self.index_config.collection, database=self.index_config.database
                 ),
+                display_name=display_name,
             )
             yield file_data

unstructured_ingest/processes/connectors/notion/connector.py
@@ -174,6 +174,7 @@ class NotionIndexer(Indexer):
                 source_identifiers=source_identifiers,
                 metadata=metadata,
                 additional_metadata=additional_metadata,
+                display_name=source_identifiers.fullpath,
             )
         except Exception as e:
             logger.error(f"Error retrieving page {page_id}: {e}")
@@ -210,6 +211,7 @@ class NotionIndexer(Indexer):
                 source_identifiers=source_identifiers,
                 metadata=metadata,
                 additional_metadata=additional_metadata,
+                display_name=source_identifiers.fullpath,
             )
         except Exception as e:
             logger.error(f"Error retrieving database {database_id}: {e}")
unstructured_ingest/processes/connectors/outlook.py
@@ -149,11 +149,11 @@ class OutlookIndexer(Indexer):

     def _message_to_file_data(self, message: "Message") -> FileData:
         fullpath = self._generate_fullpath(message)
-
+        source_identifiers = SourceIdentifiers(filename=fullpath.name, fullpath=str(fullpath))
         return FileData(
             identifier=message.id,
             connector_type=CONNECTOR_TYPE,
-            source_identifiers=SourceIdentifiers(filename=fullpath.name, fullpath=str(fullpath)),
+            source_identifiers=source_identifiers,
             metadata=FileDataSourceMetadata(
                 url=message.resource_url,
                 version=message.change_key,
@@ -178,6 +178,7 @@ class OutlookIndexer(Indexer):
                 "has_attachments": message.has_attachments,
                 "importance": message.importance,
             },
+            display_name=source_identifiers.fullpath,
         )

     def _generate_fullpath(self, message: "Message") -> Path:
unstructured_ingest/processes/connectors/salesforce.py
@@ -182,14 +182,15 @@ class SalesforceIndexer(Indexer):
                 record_with_extension = record["Id"] + self.get_file_extension(
                     record["attributes"]["type"]
                 )
+                source_identifiers = SourceIdentifiers(
+                    filename=record_with_extension,
+                    fullpath=f"{record['attributes']['type']}/{record_with_extension}",
+                )
                 files_list.append(
                     FileData(
                         connector_type=CONNECTOR_TYPE,
                         identifier=record["Id"],
-                        source_identifiers=SourceIdentifiers(
-                            filename=record_with_extension,
-                            fullpath=f"{record['attributes']['type']}/{record_with_extension}",
-                        ),
+                        source_identifiers=source_identifiers,
                         metadata=FileDataSourceMetadata(
                             url=record["attributes"]["url"],
                             version=str(parser.parse(record["SystemModstamp"]).timestamp()),
@@ -200,6 +201,7 @@ class SalesforceIndexer(Indexer):
                             record_locator={"id": record["Id"]},
                         ),
                         additional_metadata={"record_type": record["attributes"]["type"]},
+                        display_name=source_identifiers.fullpath,
                     )
                 )
         except SalesforceMalformedRequest as e:
unstructured_ingest/processes/connectors/slack.py
@@ -122,12 +122,13 @@ class SlackIndexer(Indexer):
         identifier = hashlib.sha256(identifier_base.encode("utf-8")).hexdigest()
         filename = identifier[:16]

+        source_identifiers = SourceIdentifiers(
+            filename=f"{filename}.xml", fullpath=f"{filename}.xml"
+        )
         return FileData(
             identifier=identifier,
             connector_type=CONNECTOR_TYPE,
-            source_identifiers=SourceIdentifiers(
-                filename=f"{filename}.xml", fullpath=f"{filename}.xml"
-            ),
+            source_identifiers=source_identifiers,
             metadata=FileDataSourceMetadata(
                 date_created=ts_oldest,
                 date_modified=ts_newest,
@@ -138,6 +139,7 @@ class SlackIndexer(Indexer):
                     "latest": ts_newest,
                 },
             ),
+            display_name=source_identifiers.fullpath,
        )

     @SourceConnectionError.wrap
unstructured_ingest/processes/connectors/sql/sql.py
@@ -130,7 +130,13 @@ class SQLIndexer(Indexer, ABC):
                 (len(ids) + self.index_config.batch_size - 1) // self.index_config.batch_size
             )
         ]
+
         for batch in id_batches:
+            batch_items = [BatchItem(identifier=str(b)) for b in batch]
+            display_name = (
+                f"{self.index_config.table_name}-{self.index_config.id_column}"
+                f"-[{batch_items[0].identifier}..{batch_items[-1].identifier}]"
+            )
             # Make sure the hash is always a positive number to create identified
             yield SqlBatchFileData(
                 connector_type=self.connector_type,
@@ -140,7 +146,8 @@ class SQLIndexer(Indexer, ABC):
                 additional_metadata=SqlAdditionalMetadata(
                     table_name=self.index_config.table_name, id_column=self.index_config.id_column
                 ),
-                batch_items=[BatchItem(identifier=str(b)) for b in batch],
+                batch_items=batch_items,
+                display_name=display_name,
             )

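The context line at the top of the SQL hunk computes the batch count with integer ceiling division, a pattern worth a quick check:

# (n + k - 1) // k is ceiling division without importing math.
ids = list(range(103))
batch_size = 25
num_batches = (len(ids) + batch_size - 1) // batch_size
assert num_batches == 5  # four full batches of 25 plus one batch of 3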
unstructured_ingest/processes/connectors/zendesk/zendesk.py
@@ -86,12 +86,13 @@ class ZendeskIndexer(Indexer):
     async def get_tickets(self) -> AsyncGenerator[ZendeskFileData, None]:
         async with self.connection_config.get_client() as client:
             async for ticket in client.get_tickets():
+                source_identifiers = SourceIdentifiers(
+                    filename=f"{ticket.id}.txt", fullpath=f"tickets/{ticket.id}.txt"
+                )
                 yield ZendeskFileData(
                     identifier=str(ticket.id),
                     connector_type=self.connector_type,
-                    source_identifiers=SourceIdentifiers(
-                        filename=f"{ticket.id}.txt", fullpath=f"tickets/{ticket.id}.txt"
-                    ),
+                    source_identifiers=source_identifiers,
                     additional_metadata=ZendeskAdditionalMetadata(
                         item_type="ticket", content=ticket
                     ),
@@ -101,17 +102,19 @@ class ZendeskIndexer(Indexer):
                         date_modified=ticket.updated_at.isoformat() if ticket.updated_at else None,
                         date_processed=str(time()),
                     ),
+                    display_name=source_identifiers.fullpath,
                 )

     async def get_articles(self) -> AsyncGenerator[ZendeskFileData, None]:
         async with self.connection_config.get_client() as client:
             async for article in client.get_articles():
+                source_identifiers = SourceIdentifiers(
+                    filename=f"{article.id}.html", fullpath=f"articles/{article.id}.html"
+                )
                 yield ZendeskFileData(
                     identifier=str(article.id),
                     connector_type=self.connector_type,
-                    source_identifiers=SourceIdentifiers(
-                        filename=f"{article.id}.html", fullpath=f"articles/{article.id}.html"
-                    ),
+                    source_identifiers=source_identifiers,
                     additional_metadata=ZendeskAdditionalMetadata(
                         item_type="article", content=article
                     ),
@@ -123,6 +126,7 @@ class ZendeskIndexer(Indexer):
                         ),
                         date_processed=str(time()),
                     ),
+                    display_name=source_identifiers.fullpath,
                 )

     async def run_async(self, **kwargs: Any) -> AsyncGenerator[ZendeskFileData, None]:
{unstructured_ingest-1.0.40.dist-info → unstructured_ingest-1.0.44.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: unstructured_ingest
-Version: 1.0.40
+Version: 1.0.44
 Summary: Local ETL data pipeline to get data RAG ready
 Author-email: Unstructured Technologies <devops@unstructuredai.io>
 License-Expression: Apache-2.0
@@ -60,6 +60,7 @@ Provides-Extra: delta-table
 Requires-Dist: boto3; extra == 'delta-table'
 Requires-Dist: deltalake; extra == 'delta-table'
 Requires-Dist: pandas; extra == 'delta-table'
+Requires-Dist: tenacity; extra == 'delta-table'
 Provides-Extra: discord
 Requires-Dist: discord-py; extra == 'discord'
 Provides-Extra: doc
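The new tenacity requirement is scoped to the same extra as the other Delta Lake dependencies, so it is only pulled in when the connector is installed via the delta-table extra (for example, pip install "unstructured-ingest[delta-table]").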
{unstructured_ingest-1.0.40.dist-info → unstructured_ingest-1.0.44.dist-info}/RECORD RENAMED
@@ -1,5 +1,5 @@
 unstructured_ingest/__init__.py,sha256=U4S_2y3zgLZVfMenHRaJFBW8yqh2mUBuI291LGQVOJ8,35
-unstructured_ingest/__version__.py,sha256=
+unstructured_ingest/__version__.py,sha256=12SSwrWI8zU57pbaRSeJH9dGmuvWZXi056-PfBAhJTw,43
 unstructured_ingest/error.py,sha256=qDncnJgbf5ils956RcO2CGlAKYDT5OaEM9Clv1JVTNc,1448
 unstructured_ingest/errors_v2.py,sha256=9RuRCi7lbDxCguDz07y5RiHoQiFIOWwOD7xqzJ2B3Yw,436
 unstructured_ingest/logger.py,sha256=7e_7UeK6hVOd5BQ6i9NzRUAPCS_DF839Y8TjUDywraY,1428
@@ -29,7 +29,7 @@ unstructured_ingest/embed/interfaces.py,sha256=Y3PLhgWnMDmtpugE37hlAiBIbC8izrFFX
 unstructured_ingest/embed/mixedbreadai.py,sha256=uKTqzoi4M_WeYZu-qc_TSxwJONOESzxVbBLUbD1Wbns,3922
 unstructured_ingest/embed/octoai.py,sha256=yZuD7R4mEKS4Jjyae_IrNWogMPOFFS8gW5oUllj3ROU,4540
 unstructured_ingest/embed/openai.py,sha256=TMEOPVfm_OSs4tb3Ymd6q5J49R_-YKvO4TOqCHb3bwk,4647
-unstructured_ingest/embed/togetherai.py,sha256=
+unstructured_ingest/embed/togetherai.py,sha256=ykaveEUBxBGBzRlmWc9utCFQuUWHdbW4F9KAb-uBAJM,3630
 unstructured_ingest/embed/vertexai.py,sha256=DphvPhiYdXTMrQxJCd-64vMs4iVdLY_BphHqz3n5HfM,3758
 unstructured_ingest/embed/voyageai.py,sha256=EOrYzaoXOZ6C4fNkMlCgb8KA8rdfgVXN3USMFpnn0Bs,4698
 unstructured_ingest/interfaces/__init__.py,sha256=QIkWqjsq9INTa89gPuXlMlQL4s3y5TqLmPkuVuTyXcs,795
@@ -62,37 +62,37 @@ unstructured_ingest/processes/filter.py,sha256=oc3SYukRYfzx8sdJqF3KxdwZcrA-1U8PT
 unstructured_ingest/processes/partitioner.py,sha256=Kn_BSFYvOkwo8fqThw_cOpgD0Um-AdoSqclZplcdNBA,10109
 unstructured_ingest/processes/uncompress.py,sha256=o9JL3Bza4KPUTmrB39-v_5SuK_fYwhwFAhjQi2Pm8h8,2426
 unstructured_ingest/processes/connectors/__init__.py,sha256=cR4ZH2dpPod7QR6OsgMx8X9kpFcEc1TVfQndUNoKGzI,6812
-unstructured_ingest/processes/connectors/airtable.py,sha256=
-unstructured_ingest/processes/connectors/astradb.py,sha256=
+unstructured_ingest/processes/connectors/airtable.py,sha256=dDZDKim8ON0yMHv-7cxutjllV4iM9x0RZg0yfP2wQpM,9063
+unstructured_ingest/processes/connectors/astradb.py,sha256=qi9G3s88GYSV3TXNrbcO0n32SuxO-uagtUIodjgyKVU,19216
 unstructured_ingest/processes/connectors/azure_ai_search.py,sha256=szhSRXzUHk0DE2hGFfjGc_jNFzlUwiRlCtIkuu7tmnk,11524
 unstructured_ingest/processes/connectors/chroma.py,sha256=q5_Fu4xb6_W_NyrPxVa3-jVwZLqVdlBNlR4dFvbd7l0,7235
-unstructured_ingest/processes/connectors/confluence.py,sha256=
+unstructured_ingest/processes/connectors/confluence.py,sha256=aA2B_FPdAjlVAJtmMldYu6lld2sR-6JL5tWh7yItiwg,22828
 unstructured_ingest/processes/connectors/couchbase.py,sha256=KCHoYDNya9B05NIB5D78zXoizFyfpJRepcYBe1nLSOs,12298
-unstructured_ingest/processes/connectors/delta_table.py,sha256=
-unstructured_ingest/processes/connectors/discord.py,sha256=
+unstructured_ingest/processes/connectors/delta_table.py,sha256=Y3yJPfwTyDdv7dqn54ZLZ4DBjg9OF2rXuUaNfbPCkvc,9993
+unstructured_ingest/processes/connectors/discord.py,sha256=CD-SBECMdr3pnmqbPvBMyPU2cBroXUhyW6F7L3laP6A,5348
 unstructured_ingest/processes/connectors/github.py,sha256=smHCz6jOH1p_hW2S25bYunBBj_pYjz8HTw6wkzaJz_A,7765
-unstructured_ingest/processes/connectors/gitlab.py,sha256=
+unstructured_ingest/processes/connectors/gitlab.py,sha256=Fdq6_lk-By1JDmLGVjoKJkaHESiKTZsbvoHhMsljlE0,10114
 unstructured_ingest/processes/connectors/google_drive.py,sha256=jQb4_rKL_tJg7s7m-H8nrvc0GKwxiubtg8KL3-ZIGPM,35304
-unstructured_ingest/processes/connectors/jira.py,sha256=
+unstructured_ingest/processes/connectors/jira.py,sha256=BuZwExmdcI-R_MGPUwm8TnFh2jEjjwkyA1T51Bgqh-U,18558
 unstructured_ingest/processes/connectors/kdbai.py,sha256=XhxYpKSAoFPBsDQWwNuLX03DCxOVr7yquj9VYM55Rtc,5174
-unstructured_ingest/processes/connectors/local.py,sha256=
+unstructured_ingest/processes/connectors/local.py,sha256=CesMduUiSPqdJpqIyW28icGvGAo4hfa-4fzbYajmMSo,7450
 unstructured_ingest/processes/connectors/milvus.py,sha256=L-PM5osheNyNsLGYZmiF3rRmeulp7Ejk92JCoaQ_F9Y,12075
-unstructured_ingest/processes/connectors/mongodb.py,sha256=
+unstructured_ingest/processes/connectors/mongodb.py,sha256=OmbbmE_pSDVjrn1YfjrQMTTs6JhTOJUU5d_jULxgtaM,14545
 unstructured_ingest/processes/connectors/neo4j.py,sha256=ztxvI9KY8RF5kYUuMGSzzN5mz7Fu_4Ai9P7dqCpJLc0,20267
-unstructured_ingest/processes/connectors/onedrive.py,sha256=
-unstructured_ingest/processes/connectors/outlook.py,sha256=
+unstructured_ingest/processes/connectors/onedrive.py,sha256=nZt6hsFMlURgB5-BioFBzJySieRVU8xi99QhOCtorxQ,19343
+unstructured_ingest/processes/connectors/outlook.py,sha256=6HHubZI_zttEfYp0XNd4Y1vhjsS8uSg7aZ2LBrTjfHk,9376
 unstructured_ingest/processes/connectors/pinecone.py,sha256=jCabAqKQyBFzaGjphxLMr57y7P0Z15Jd9Jj-JM40YnU,15090
 unstructured_ingest/processes/connectors/redisdb.py,sha256=rTihbfv0Mlk1eo5Izn-JXRu5Ad5C-KD58nSqeKsaZJ8,8024
-unstructured_ingest/processes/connectors/salesforce.py,sha256=
+unstructured_ingest/processes/connectors/salesforce.py,sha256=N_UoebrhzXZNWw-X7lg8_qAziXx5L_d8XHnHWKNNYR8,11767
 unstructured_ingest/processes/connectors/sharepoint.py,sha256=vIfLIactYXcdetccHvKlYOay6NOzGj2X0CkXbY0KuRo,6213
-unstructured_ingest/processes/connectors/slack.py,sha256=
+unstructured_ingest/processes/connectors/slack.py,sha256=oboIfX7ayBMK0te5Nv50iyL3FQJFXJbRxZSQaCMp3kM,9318
 unstructured_ingest/processes/connectors/utils.py,sha256=TAd0hb1f291N-q7-TUe6JKSCGkhqDyo7Ij8zmliBZUc,2071
 unstructured_ingest/processes/connectors/vectara.py,sha256=xrC6jkgW8BII4UjdzUelDu122xT484cpfMTK2wl-sko,12292
 unstructured_ingest/processes/connectors/assets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql,sha256=8a9HTcRWA6IuswSD632b_uZSO6Dax_0rUYnflqktcek,226
 unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json,sha256=SJlIO0kXxy866tWQ8bEzvwLwflsoUMIS-OKlxMvHIuE,504
 unstructured_ingest/processes/connectors/databricks/__init__.py,sha256=RtKAPyNtXh6fzEsOQ08pA0-vC1uMr3KqYG6cqiBoo70,2133
-unstructured_ingest/processes/connectors/databricks/volumes.py,sha256=
+unstructured_ingest/processes/connectors/databricks/volumes.py,sha256=EltntY0i9t7N7__ePfEUanWO9wLy_gxNd48KXz1TxUw,8373
 unstructured_ingest/processes/connectors/databricks/volumes_aws.py,sha256=WhGTp6aRTLSdc4GChCL4mz2b-IanderW8j1IqezX6YA,2958
 unstructured_ingest/processes/connectors/databricks/volumes_azure.py,sha256=pF2d6uAIbwJJUeOIG5xknUMCGc5d9Aztmc2776wp-a0,3740
 unstructured_ingest/processes/connectors/databricks/volumes_gcp.py,sha256=y9AvVl6PtnIxlTlrPj_wyHBDBRJNq3uoTOuZwTryNg8,2994
@@ -103,7 +103,7 @@ unstructured_ingest/processes/connectors/duckdb/base.py,sha256=bTLhilg6mgERNCpee
 unstructured_ingest/processes/connectors/duckdb/duckdb.py,sha256=jsmibTd_yvYzkCT05HhCJvplyobtjfNILC3zyTuCcVY,4464
 unstructured_ingest/processes/connectors/duckdb/motherduck.py,sha256=Atr2MjJQGFGWh5aeiQsLpUbFw-aCZH-ABI1LprDh5VI,4727
 unstructured_ingest/processes/connectors/elasticsearch/__init__.py,sha256=M8mmBWoP6J5R3hxg6BQUMexYlTUxUxdBoIcjUop8yt8,826
-unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py,sha256=
+unstructured_ingest/processes/connectors/elasticsearch/elasticsearch.py,sha256=iCC4AP5s8YYa8sMldTFcHp9sfUK1LdQTD0oqXnvklwM,19305
 unstructured_ingest/processes/connectors/elasticsearch/opensearch.py,sha256=wggHvw8h-X0-3WPNxj9rt2xkrE7Pv7CV0B0KzTMzBB4,6944
 unstructured_ingest/processes/connectors/fsspec/__init__.py,sha256=3HTdw4L4mdN4W8UX0jQbMxBg0ZfITPbEXU7Bwdo1BfI,1843
 unstructured_ingest/processes/connectors/fsspec/azure.py,sha256=31VNiG5YnXfhrFX7QJ2O1ubeWHxbe1sYVIztefbscAQ,7148
@@ -129,7 +129,7 @@ unstructured_ingest/processes/connectors/lancedb/lancedb.py,sha256=qyco2ZPcE-MqE
 unstructured_ingest/processes/connectors/lancedb/local.py,sha256=rhRxoK-h1Q0wdRhUq8Y5y48fbkvvCcIbA4gZvtteHq4,1263
 unstructured_ingest/processes/connectors/notion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/processes/connectors/notion/client.py,sha256=wmlkbuER2crKjrqYm1dJwrCe8qH9gX-R4yckg5GQ41I,13174
-unstructured_ingest/processes/connectors/notion/connector.py,sha256=
+unstructured_ingest/processes/connectors/notion/connector.py,sha256=WdhnB9vZs5nenQJ-DNx4SV7p2-jcQVp3Fe6nxS7Y9SI,13426
 unstructured_ingest/processes/connectors/notion/helpers.py,sha256=Z4qjdsdFyrgE0KwE8gDZdZ88LsP_NYQit697Po6w878,16424
 unstructured_ingest/processes/connectors/notion/interfaces.py,sha256=SrTT-9c0nvk0fMqVgudYF647r04AdMKi6wkIkMy7Szw,563
 unstructured_ingest/processes/connectors/notion/ingest_backoff/__init__.py,sha256=cfdIJuZDFcF3w84sTyYqZ8vXnSMfMABXFc100r3g5kU,63
@@ -207,7 +207,7 @@ unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py,sha256=_
 unstructured_ingest/processes/connectors/sql/postgres.py,sha256=kDIL8Cj45EDpKqit1_araRpP4v3cb__QbYqoINg9f2k,5403
 unstructured_ingest/processes/connectors/sql/singlestore.py,sha256=B46lpvyAj1AArpACi9MXbXD1-52zF6Dsj3RJtD1g4r0,5955
 unstructured_ingest/processes/connectors/sql/snowflake.py,sha256=dkGIFz_VIVhew_FjbuO8r3cVluw7VIUdvV6VjkAItP8,11369
-unstructured_ingest/processes/connectors/sql/sql.py,sha256=
+unstructured_ingest/processes/connectors/sql/sql.py,sha256=jIwAck_vFlsMczH7BOyI-iZC_lrLAV-1eqmGtKkPNQc,16170
 unstructured_ingest/processes/connectors/sql/sqlite.py,sha256=V3OfRrXGGhTa_R2FPA-ysn95HHCv9x_VEBKVDsSGsbs,5549
 unstructured_ingest/processes/connectors/sql/vastdb.py,sha256=trhvUBumDmj2rLjmxFBKw9L9wF6ZpssF0wfmRaG97H0,9803
 unstructured_ingest/processes/connectors/weaviate/__init__.py,sha256=1Vnz8hm_Cf3NkQUTz5ZD4QkbLSVql4UvRoY2j2FnC9k,853
@@ -217,7 +217,7 @@ unstructured_ingest/processes/connectors/weaviate/local.py,sha256=4fgZsL9dgnWuaS
 unstructured_ingest/processes/connectors/weaviate/weaviate.py,sha256=yB67gxvo3X0UaP_mNeB0HbSWXst7ur0E2QKwLA0gIS4,13647
 unstructured_ingest/processes/connectors/zendesk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/processes/connectors/zendesk/client.py,sha256=GvPIpx4aYdD58-edHgvCFjFao94uR0O5Yf4dT9NCmSk,11952
-unstructured_ingest/processes/connectors/zendesk/zendesk.py,sha256=
+unstructured_ingest/processes/connectors/zendesk/zendesk.py,sha256=doS6d7ZhXBgJN8aPVf7vnQr8BciQbzX8-4yl4_hDZ7w,9253
 unstructured_ingest/processes/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unstructured_ingest/processes/utils/blob_storage.py,sha256=apMUmm9loxdbTRkkLH4VhG9kUVyiw9PFUJheSDxSxPk,1023
 unstructured_ingest/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -231,8 +231,8 @@ unstructured_ingest/utils/ndjson.py,sha256=nz8VUOPEgAFdhaDOpuveknvCU4x82fVwqE01q
 unstructured_ingest/utils/pydantic_models.py,sha256=BT_j15e4rX40wQbt8LUXbqfPhA3rJn1PHTI_G_A_EHY,1720
 unstructured_ingest/utils/string_and_date_utils.py,sha256=oXOI6rxXq-8ncbk7EoJK0WCcTXWj75EzKl8pfQMID3U,2522
 unstructured_ingest/utils/table.py,sha256=WZechczgVFvlodUWFcsnCGvBNh1xRm6hr0VbJTPxKAc,3669
-unstructured_ingest-1.0.40.dist-info/METADATA,sha256=
-unstructured_ingest-1.0.40.dist-info/WHEEL,sha256=
-unstructured_ingest-1.0.40.dist-info/entry_points.txt,sha256=
-unstructured_ingest-1.0.40.dist-info/licenses/LICENSE.md,sha256=
-unstructured_ingest-1.0.40.dist-info/RECORD,,
+unstructured_ingest-1.0.44.dist-info/METADATA,sha256=PR_LHUUQP-2oayEmsoTGblqWKPmJt46QtijI7y-zni0,8795
+unstructured_ingest-1.0.44.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+unstructured_ingest-1.0.44.dist-info/entry_points.txt,sha256=gUAAFnjFPnBgThJSEbw0N5ZjxtaKlT1s9e05_arQrNw,70
+unstructured_ingest-1.0.44.dist-info/licenses/LICENSE.md,sha256=SxkKP_62uIAKb9mb1eH7FH4Kn2aYT09fgjKpJt5PyTk,11360
+unstructured_ingest-1.0.44.dist-info/RECORD,,
{unstructured_ingest-1.0.40.dist-info → unstructured_ingest-1.0.44.dist-info}/WHEEL RENAMED: file without changes
{unstructured_ingest-1.0.40.dist-info → unstructured_ingest-1.0.44.dist-info}/entry_points.txt RENAMED: file without changes
{unstructured_ingest-1.0.40.dist-info → unstructured_ingest-1.0.44.dist-info}/licenses/LICENSE.md RENAMED: file without changes