unstructured-ingest 0.5.25__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- test/integration/connectors/databricks/test_volumes_native.py +1 -1
- test/integration/connectors/duckdb/test_duckdb.py +1 -1
- test/integration/connectors/duckdb/test_motherduck.py +1 -1
- test/integration/connectors/elasticsearch/test_elasticsearch.py +1 -1
- test/integration/connectors/elasticsearch/test_opensearch.py +1 -1
- test/integration/connectors/sql/test_databricks_delta_tables.py +1 -1
- test/integration/connectors/sql/test_postgres.py +1 -1
- test/integration/connectors/sql/test_singlestore.py +1 -1
- test/integration/connectors/sql/test_snowflake.py +1 -1
- test/integration/connectors/sql/test_sqlite.py +1 -1
- test/integration/connectors/test_astradb.py +1 -1
- test/integration/connectors/test_azure_ai_search.py +1 -1
- test/integration/connectors/test_chroma.py +1 -1
- test/integration/connectors/test_delta_table.py +1 -1
- test/integration/connectors/test_lancedb.py +1 -1
- test/integration/connectors/test_milvus.py +1 -1
- test/integration/connectors/test_mongodb.py +1 -1
- test/integration/connectors/test_neo4j.py +5 -5
- test/integration/connectors/test_onedrive.py +1 -1
- test/integration/connectors/test_pinecone.py +1 -1
- test/integration/connectors/test_qdrant.py +1 -1
- test/integration/connectors/test_redis.py +1 -1
- test/integration/connectors/test_s3.py +1 -1
- test/integration/connectors/test_vectara.py +1 -1
- test/integration/connectors/utils/validation/destination.py +2 -1
- test/integration/connectors/utils/validation/source.py +2 -1
- test/integration/connectors/weaviate/test_local.py +1 -1
- test/unit/test_html.py +1 -1
- test/unit/v2/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +1 -1
- test/unit/v2/connectors/motherduck/test_base.py +1 -2
- test/unit/v2/connectors/sql/test_sql.py +1 -1
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/html.py +2 -1
- unstructured_ingest/v2/interfaces/__init__.py +0 -13
- unstructured_ingest/v2/interfaces/downloader.py +1 -1
- unstructured_ingest/v2/interfaces/indexer.py +1 -1
- unstructured_ingest/v2/interfaces/upload_stager.py +2 -2
- unstructured_ingest/v2/interfaces/uploader.py +2 -3
- unstructured_ingest/v2/pipeline/steps/chunk.py +1 -2
- unstructured_ingest/v2/pipeline/steps/download.py +2 -3
- unstructured_ingest/v2/pipeline/steps/embed.py +1 -2
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
- unstructured_ingest/v2/pipeline/steps/partition.py +1 -2
- unstructured_ingest/v2/pipeline/steps/stage.py +2 -2
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -2
- unstructured_ingest/v2/processes/chunker.py +5 -2
- unstructured_ingest/v2/processes/connectors/airtable.py +1 -2
- unstructured_ingest/v2/processes/connectors/astradb.py +7 -5
- unstructured_ingest/v2/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/v2/processes/connectors/chroma.py +1 -1
- unstructured_ingest/v2/processes/connectors/confluence.py +5 -3
- unstructured_ingest/v2/processes/connectors/couchbase.py +7 -5
- unstructured_ingest/v2/processes/connectors/databricks/volumes.py +5 -3
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +2 -1
- unstructured_ingest/v2/processes/connectors/delta_table.py +1 -1
- unstructured_ingest/v2/processes/connectors/discord.py +5 -3
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +2 -1
- unstructured_ingest/v2/processes/connectors/duckdb/duckdb.py +1 -1
- unstructured_ingest/v2/processes/connectors/duckdb/motherduck.py +1 -1
- unstructured_ingest/v2/processes/connectors/elasticsearch/elasticsearch.py +7 -5
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +5 -3
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +3 -3
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +1 -1
- unstructured_ingest/v2/processes/connectors/gitlab.py +5 -3
- unstructured_ingest/v2/processes/connectors/google_drive.py +5 -3
- unstructured_ingest/v2/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +1 -1
- unstructured_ingest/v2/processes/connectors/jira.py +5 -3
- unstructured_ingest/v2/processes/connectors/kafka/kafka.py +5 -3
- unstructured_ingest/v2/processes/connectors/kdbai.py +1 -1
- unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +8 -4
- unstructured_ingest/v2/processes/connectors/local.py +5 -3
- unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
- unstructured_ingest/v2/processes/connectors/mongodb.py +7 -5
- unstructured_ingest/v2/processes/connectors/neo4j.py +1 -1
- unstructured_ingest/v2/processes/connectors/notion/connector.py +5 -3
- unstructured_ingest/v2/processes/connectors/onedrive.py +5 -3
- unstructured_ingest/v2/processes/connectors/outlook.py +5 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +1 -1
- unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +1 -1
- unstructured_ingest/v2/processes/connectors/redisdb.py +1 -1
- unstructured_ingest/v2/processes/connectors/salesforce.py +5 -3
- unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -3
- unstructured_ingest/v2/processes/connectors/slack.py +2 -2
- unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +1 -1
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +1 -1
- unstructured_ingest/v2/processes/connectors/sql/sql.py +7 -5
- unstructured_ingest/v2/processes/connectors/sql/vastdb.py +3 -3
- unstructured_ingest/v2/processes/connectors/vectara.py +1 -1
- unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +1 -1
- unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py +5 -3
- unstructured_ingest/v2/processes/filter.py +1 -1
- unstructured_ingest/v2/processes/uncompress.py +1 -1
- unstructured_ingest/v2/processes/utils/blob_storage.py +2 -1
- unstructured_ingest/v2/utils.py +1 -1
- {unstructured_ingest-0.5.25.dist-info → unstructured_ingest-0.6.1.dist-info}/METADATA +16 -16
- {unstructured_ingest-0.5.25.dist-info → unstructured_ingest-0.6.1.dist-info}/RECORD +105 -106
- unstructured_ingest/v2/interfaces/file_data.py +0 -13
- {unstructured_ingest-0.5.25.dist-info → unstructured_ingest-0.6.1.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.5.25.dist-info → unstructured_ingest-0.6.1.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.5.25.dist-info → unstructured_ingest-0.6.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.5.25.dist-info → unstructured_ingest-0.6.1.dist-info}/top_level.txt +0 -0
|
@@ -13,7 +13,6 @@ from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
|
13
13
|
from unstructured_ingest.v2.interfaces import (
|
|
14
14
|
AccessConfig,
|
|
15
15
|
ConnectionConfig,
|
|
16
|
-
FileData,
|
|
17
16
|
Uploader,
|
|
18
17
|
UploaderConfig,
|
|
19
18
|
UploadStager,
|
|
@@ -23,6 +22,7 @@ from unstructured_ingest.v2.logger import logger
|
|
|
23
22
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
24
23
|
DestinationRegistryEntry,
|
|
25
24
|
)
|
|
25
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
26
26
|
|
|
27
27
|
if TYPE_CHECKING:
|
|
28
28
|
from pymilvus import MilvusClient
|
|
@@ -13,17 +13,12 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
|
13
13
|
from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
14
14
|
from unstructured_ingest.v2.interfaces import (
|
|
15
15
|
AccessConfig,
|
|
16
|
-
BatchFileData,
|
|
17
|
-
BatchItem,
|
|
18
16
|
ConnectionConfig,
|
|
19
17
|
Downloader,
|
|
20
18
|
DownloaderConfig,
|
|
21
19
|
DownloadResponse,
|
|
22
|
-
FileData,
|
|
23
|
-
FileDataSourceMetadata,
|
|
24
20
|
Indexer,
|
|
25
21
|
IndexerConfig,
|
|
26
|
-
SourceIdentifiers,
|
|
27
22
|
Uploader,
|
|
28
23
|
UploaderConfig,
|
|
29
24
|
download_responses,
|
|
@@ -33,6 +28,13 @@ from unstructured_ingest.v2.processes.connector_registry import (
|
|
|
33
28
|
DestinationRegistryEntry,
|
|
34
29
|
SourceRegistryEntry,
|
|
35
30
|
)
|
|
31
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
32
|
+
BatchFileData,
|
|
33
|
+
BatchItem,
|
|
34
|
+
FileData,
|
|
35
|
+
FileDataSourceMetadata,
|
|
36
|
+
SourceIdentifiers,
|
|
37
|
+
)
|
|
36
38
|
|
|
37
39
|
if TYPE_CHECKING:
|
|
38
40
|
from pymongo import MongoClient
|
|
@@ -18,7 +18,6 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
|
18
18
|
from unstructured_ingest.v2.interfaces import (
|
|
19
19
|
AccessConfig,
|
|
20
20
|
ConnectionConfig,
|
|
21
|
-
FileData,
|
|
22
21
|
Uploader,
|
|
23
22
|
UploaderConfig,
|
|
24
23
|
UploadStager,
|
|
@@ -28,6 +27,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
|
|
|
28
27
|
DestinationRegistryEntry,
|
|
29
28
|
)
|
|
30
29
|
from unstructured_ingest.v2.processes.connectors.utils import format_and_truncate_orig_elements
|
|
30
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
31
31
|
|
|
32
32
|
SimilarityFunction = Literal["cosine"]
|
|
33
33
|
|
|
@@ -12,14 +12,16 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
12
12
|
Downloader,
|
|
13
13
|
DownloaderConfig,
|
|
14
14
|
DownloadResponse,
|
|
15
|
-
FileData,
|
|
16
|
-
FileDataSourceMetadata,
|
|
17
15
|
Indexer,
|
|
18
16
|
IndexerConfig,
|
|
19
|
-
SourceIdentifiers,
|
|
20
17
|
)
|
|
21
18
|
from unstructured_ingest.v2.logger import logger
|
|
22
19
|
from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
|
|
20
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
21
|
+
FileData,
|
|
22
|
+
FileDataSourceMetadata,
|
|
23
|
+
SourceIdentifiers,
|
|
24
|
+
)
|
|
23
25
|
|
|
24
26
|
if TYPE_CHECKING:
|
|
25
27
|
from unstructured_ingest.v2.processes.connectors.notion.client import Client
|
|
@@ -22,11 +22,8 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
22
22
|
Downloader,
|
|
23
23
|
DownloaderConfig,
|
|
24
24
|
DownloadResponse,
|
|
25
|
-
FileData,
|
|
26
|
-
FileDataSourceMetadata,
|
|
27
25
|
Indexer,
|
|
28
26
|
IndexerConfig,
|
|
29
|
-
SourceIdentifiers,
|
|
30
27
|
Uploader,
|
|
31
28
|
UploaderConfig,
|
|
32
29
|
)
|
|
@@ -39,6 +36,11 @@ from unstructured_ingest.v2.processes.utils.blob_storage import (
|
|
|
39
36
|
BlobStoreUploadStager,
|
|
40
37
|
BlobStoreUploadStagerConfig,
|
|
41
38
|
)
|
|
39
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
40
|
+
FileData,
|
|
41
|
+
FileDataSourceMetadata,
|
|
42
|
+
SourceIdentifiers,
|
|
43
|
+
)
|
|
42
44
|
|
|
43
45
|
if TYPE_CHECKING:
|
|
44
46
|
from office365.graph_client import GraphClient
|
|
@@ -16,12 +16,15 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
16
16
|
Downloader,
|
|
17
17
|
DownloaderConfig,
|
|
18
18
|
DownloadResponse,
|
|
19
|
-
FileData,
|
|
20
19
|
Indexer,
|
|
21
20
|
IndexerConfig,
|
|
22
21
|
)
|
|
23
|
-
from unstructured_ingest.v2.interfaces.file_data import FileDataSourceMetadata, SourceIdentifiers
|
|
24
22
|
from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
|
|
23
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
24
|
+
FileData,
|
|
25
|
+
FileDataSourceMetadata,
|
|
26
|
+
SourceIdentifiers,
|
|
27
|
+
)
|
|
25
28
|
|
|
26
29
|
MAX_EMAILS_PER_FOLDER = 1_000_000 # Maximum number of emails per folder
|
|
27
30
|
|
|
@@ -13,7 +13,6 @@ from unstructured_ingest.v2.errors import UserError
|
|
|
13
13
|
from unstructured_ingest.v2.interfaces import (
|
|
14
14
|
AccessConfig,
|
|
15
15
|
ConnectionConfig,
|
|
16
|
-
FileData,
|
|
17
16
|
UploaderConfig,
|
|
18
17
|
UploadStager,
|
|
19
18
|
UploadStagerConfig,
|
|
@@ -21,6 +20,7 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
21
20
|
)
|
|
22
21
|
from unstructured_ingest.v2.logger import logger
|
|
23
22
|
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
23
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
24
24
|
from unstructured_ingest.v2.utils import get_enhanced_element_id
|
|
25
25
|
|
|
26
26
|
if TYPE_CHECKING:
|
|
@@ -13,13 +13,13 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
|
13
13
|
from unstructured_ingest.v2.interfaces import (
|
|
14
14
|
AccessConfig,
|
|
15
15
|
ConnectionConfig,
|
|
16
|
-
FileData,
|
|
17
16
|
Uploader,
|
|
18
17
|
UploaderConfig,
|
|
19
18
|
UploadStager,
|
|
20
19
|
UploadStagerConfig,
|
|
21
20
|
)
|
|
22
21
|
from unstructured_ingest.v2.logger import logger
|
|
22
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
23
23
|
from unstructured_ingest.v2.utils import get_enhanced_element_id
|
|
24
24
|
|
|
25
25
|
if TYPE_CHECKING:
|
|
@@ -11,12 +11,12 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
|
11
11
|
from unstructured_ingest.v2.interfaces import (
|
|
12
12
|
AccessConfig,
|
|
13
13
|
ConnectionConfig,
|
|
14
|
-
FileData,
|
|
15
14
|
Uploader,
|
|
16
15
|
UploaderConfig,
|
|
17
16
|
)
|
|
18
17
|
from unstructured_ingest.v2.logger import logger
|
|
19
18
|
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
19
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
20
20
|
|
|
21
21
|
if TYPE_CHECKING:
|
|
22
22
|
from redis.asyncio import Redis
|
|
@@ -28,16 +28,18 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
28
28
|
Downloader,
|
|
29
29
|
DownloaderConfig,
|
|
30
30
|
DownloadResponse,
|
|
31
|
-
FileData,
|
|
32
|
-
FileDataSourceMetadata,
|
|
33
31
|
Indexer,
|
|
34
32
|
IndexerConfig,
|
|
35
|
-
SourceIdentifiers,
|
|
36
33
|
)
|
|
37
34
|
from unstructured_ingest.v2.logger import logger
|
|
38
35
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
39
36
|
SourceRegistryEntry,
|
|
40
37
|
)
|
|
38
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
39
|
+
FileData,
|
|
40
|
+
FileDataSourceMetadata,
|
|
41
|
+
SourceIdentifiers,
|
|
42
|
+
)
|
|
41
43
|
|
|
42
44
|
|
|
43
45
|
class MissingCategoryError(Exception):
|
|
@@ -11,9 +11,6 @@ from unstructured_ingest.error import (
|
|
|
11
11
|
SourceConnectionNetworkError,
|
|
12
12
|
)
|
|
13
13
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
14
|
-
from unstructured_ingest.v2.interfaces import (
|
|
15
|
-
FileData,
|
|
16
|
-
)
|
|
17
14
|
from unstructured_ingest.v2.logger import logger
|
|
18
15
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
19
16
|
SourceRegistryEntry,
|
|
@@ -26,6 +23,9 @@ from unstructured_ingest.v2.processes.connectors.onedrive import (
|
|
|
26
23
|
OnedriveIndexer,
|
|
27
24
|
OnedriveIndexerConfig,
|
|
28
25
|
)
|
|
26
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
27
|
+
FileData,
|
|
28
|
+
)
|
|
29
29
|
|
|
30
30
|
if TYPE_CHECKING:
|
|
31
31
|
from office365.onedrive.driveitems.driveItem import DriveItem
|
|
@@ -20,12 +20,12 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
20
20
|
Indexer,
|
|
21
21
|
IndexerConfig,
|
|
22
22
|
)
|
|
23
|
-
from unstructured_ingest.v2.
|
|
23
|
+
from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
|
|
24
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
24
25
|
FileData,
|
|
25
26
|
FileDataSourceMetadata,
|
|
26
27
|
SourceIdentifiers,
|
|
27
28
|
)
|
|
28
|
-
from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
|
|
29
29
|
|
|
30
30
|
if TYPE_CHECKING:
|
|
31
31
|
from slack_sdk import WebClient
|
|
@@ -7,7 +7,6 @@ from pydantic import Field, Secret
|
|
|
7
7
|
|
|
8
8
|
from unstructured_ingest.utils.data_prep import split_dataframe
|
|
9
9
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
10
|
-
from unstructured_ingest.v2.interfaces import FileData
|
|
11
10
|
from unstructured_ingest.v2.logger import logger
|
|
12
11
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
13
12
|
DestinationRegistryEntry,
|
|
@@ -20,6 +19,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
|
|
|
20
19
|
SQLUploadStager,
|
|
21
20
|
SQLUploadStagerConfig,
|
|
22
21
|
)
|
|
22
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
23
23
|
|
|
24
24
|
if TYPE_CHECKING:
|
|
25
25
|
from databricks.sdk.core import oauth_service_principal
|
|
@@ -7,7 +7,6 @@ from pydantic import Field, Secret
|
|
|
7
7
|
|
|
8
8
|
from unstructured_ingest.utils.data_prep import split_dataframe
|
|
9
9
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
10
|
-
from unstructured_ingest.v2.interfaces.file_data import FileData
|
|
11
10
|
from unstructured_ingest.v2.logger import logger
|
|
12
11
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
13
12
|
DestinationRegistryEntry,
|
|
@@ -28,6 +27,7 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
|
|
|
28
27
|
SQLUploadStagerConfig,
|
|
29
28
|
parse_date_string,
|
|
30
29
|
)
|
|
30
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
31
31
|
|
|
32
32
|
if TYPE_CHECKING:
|
|
33
33
|
from pandas import DataFrame
|
|
@@ -16,17 +16,12 @@ from unstructured_ingest.utils.data_prep import get_data, get_data_df, split_dat
|
|
|
16
16
|
from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
17
17
|
from unstructured_ingest.v2.interfaces import (
|
|
18
18
|
AccessConfig,
|
|
19
|
-
BatchFileData,
|
|
20
|
-
BatchItem,
|
|
21
19
|
ConnectionConfig,
|
|
22
20
|
Downloader,
|
|
23
21
|
DownloaderConfig,
|
|
24
22
|
DownloadResponse,
|
|
25
|
-
FileData,
|
|
26
|
-
FileDataSourceMetadata,
|
|
27
23
|
Indexer,
|
|
28
24
|
IndexerConfig,
|
|
29
|
-
SourceIdentifiers,
|
|
30
25
|
Uploader,
|
|
31
26
|
UploaderConfig,
|
|
32
27
|
UploadStager,
|
|
@@ -34,6 +29,13 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
34
29
|
download_responses,
|
|
35
30
|
)
|
|
36
31
|
from unstructured_ingest.v2.logger import logger
|
|
32
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
33
|
+
BatchFileData,
|
|
34
|
+
BatchItem,
|
|
35
|
+
FileData,
|
|
36
|
+
FileDataSourceMetadata,
|
|
37
|
+
SourceIdentifiers,
|
|
38
|
+
)
|
|
37
39
|
from unstructured_ingest.v2.utils import get_enhanced_element_id
|
|
38
40
|
|
|
39
41
|
if TYPE_CHECKING:
|
|
@@ -8,9 +8,6 @@ from unstructured_ingest.error import DestinationConnectionError
|
|
|
8
8
|
from unstructured_ingest.utils.data_prep import split_dataframe
|
|
9
9
|
from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
10
10
|
from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
11
|
-
from unstructured_ingest.v2.interfaces import (
|
|
12
|
-
FileData,
|
|
13
|
-
)
|
|
14
11
|
from unstructured_ingest.v2.logger import logger
|
|
15
12
|
from unstructured_ingest.v2.processes.connector_registry import (
|
|
16
13
|
DestinationRegistryEntry,
|
|
@@ -29,6 +26,9 @@ from unstructured_ingest.v2.processes.connectors.sql.sql import (
|
|
|
29
26
|
SQLUploadStager,
|
|
30
27
|
SQLUploadStagerConfig,
|
|
31
28
|
)
|
|
29
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
30
|
+
FileData,
|
|
31
|
+
)
|
|
32
32
|
from unstructured_ingest.v2.utils import get_enhanced_element_id
|
|
33
33
|
|
|
34
34
|
if TYPE_CHECKING:
|
|
@@ -14,7 +14,6 @@ from unstructured_ingest.utils.dep_check import requires_dependencies
|
|
|
14
14
|
from unstructured_ingest.v2.interfaces import (
|
|
15
15
|
AccessConfig,
|
|
16
16
|
ConnectionConfig,
|
|
17
|
-
FileData,
|
|
18
17
|
Uploader,
|
|
19
18
|
UploaderConfig,
|
|
20
19
|
UploadStager,
|
|
@@ -22,6 +21,7 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
22
21
|
)
|
|
23
22
|
from unstructured_ingest.v2.logger import logger
|
|
24
23
|
from unstructured_ingest.v2.processes.connector_registry import DestinationRegistryEntry
|
|
24
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
25
25
|
|
|
26
26
|
BASE_URL = "https://api.vectara.io/v2"
|
|
27
27
|
|
|
@@ -16,13 +16,13 @@ from unstructured_ingest.v2.constants import RECORD_ID_LABEL
|
|
|
16
16
|
from unstructured_ingest.v2.interfaces import (
|
|
17
17
|
AccessConfig,
|
|
18
18
|
ConnectionConfig,
|
|
19
|
-
FileData,
|
|
20
19
|
UploaderConfig,
|
|
21
20
|
UploadStager,
|
|
22
21
|
UploadStagerConfig,
|
|
23
22
|
VectorDBUploader,
|
|
24
23
|
)
|
|
25
24
|
from unstructured_ingest.v2.logger import logger
|
|
25
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
26
26
|
|
|
27
27
|
if TYPE_CHECKING:
|
|
28
28
|
from weaviate.classes.init import Timeout
|
|
@@ -16,14 +16,16 @@ from unstructured_ingest.v2.interfaces import (
|
|
|
16
16
|
Downloader,
|
|
17
17
|
DownloaderConfig,
|
|
18
18
|
DownloadResponse,
|
|
19
|
-
FileData,
|
|
20
|
-
FileDataSourceMetadata,
|
|
21
19
|
Indexer,
|
|
22
20
|
IndexerConfig,
|
|
23
|
-
SourceIdentifiers,
|
|
24
21
|
)
|
|
25
22
|
from unstructured_ingest.v2.logger import logger
|
|
26
23
|
from unstructured_ingest.v2.processes.connector_registry import SourceRegistryEntry
|
|
24
|
+
from unstructured_ingest.v2.types.file_data import (
|
|
25
|
+
FileData,
|
|
26
|
+
FileDataSourceMetadata,
|
|
27
|
+
SourceIdentifiers,
|
|
28
|
+
)
|
|
27
29
|
|
|
28
30
|
from .client import ZendeskArticle, ZendeskClient, ZendeskTicket
|
|
29
31
|
|
|
@@ -5,9 +5,9 @@ from typing import Any, Callable, Optional
|
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel, Field
|
|
7
7
|
|
|
8
|
-
from unstructured_ingest.v2.interfaces import FileData
|
|
9
8
|
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
10
9
|
from unstructured_ingest.v2.logger import logger
|
|
10
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
class FiltererConfig(BaseModel):
|
|
@@ -8,9 +8,9 @@ from uuid import NAMESPACE_DNS, uuid5
|
|
|
8
8
|
from pydantic import BaseModel
|
|
9
9
|
|
|
10
10
|
from unstructured_ingest.utils.compression import TAR_FILE_EXT, ZIP_FILE_EXT, uncompress_file
|
|
11
|
-
from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
|
|
12
11
|
from unstructured_ingest.v2.interfaces.process import BaseProcess
|
|
13
12
|
from unstructured_ingest.v2.logger import logger
|
|
13
|
+
from unstructured_ingest.v2.types.file_data import FileData, SourceIdentifiers
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class UncompressConfig(BaseModel):
|
|
@@ -3,7 +3,8 @@ from pathlib import Path
|
|
|
3
3
|
from typing import Any
|
|
4
4
|
|
|
5
5
|
from unstructured_ingest.utils.data_prep import get_data, write_data
|
|
6
|
-
from unstructured_ingest.v2.interfaces import
|
|
6
|
+
from unstructured_ingest.v2.interfaces import UploadStager, UploadStagerConfig
|
|
7
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
7
8
|
|
|
8
9
|
|
|
9
10
|
class BlobStoreUploadStagerConfig(UploadStagerConfig):
|
unstructured_ingest/v2/utils.py
CHANGED
|
@@ -8,7 +8,7 @@ from uuid import NAMESPACE_DNS, uuid5
|
|
|
8
8
|
from pydantic import BaseModel
|
|
9
9
|
from pydantic.types import _SecretBase
|
|
10
10
|
|
|
11
|
-
from unstructured_ingest.v2.
|
|
11
|
+
from unstructured_ingest.v2.types.file_data import FileData
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
def is_secret(value: Any) -> bool:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: unstructured-ingest
|
|
3
|
-
Version: 0.5.25
|
|
3
|
+
Version: 0.6.1
|
|
4
4
|
Summary: A library that prepares raw documents for downstream ML tasks.
|
|
5
5
|
Home-page: https://github.com/Unstructured-IO/unstructured-ingest
|
|
6
6
|
Author: Unstructured Technologies
|
|
@@ -22,11 +22,11 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
|
22
22
|
Requires-Python: >=3.9.0,<3.14
|
|
23
23
|
Description-Content-Type: text/markdown
|
|
24
24
|
License-File: LICENSE.md
|
|
25
|
+
Requires-Dist: click
|
|
26
|
+
Requires-Dist: tqdm
|
|
25
27
|
Requires-Dist: opentelemetry-sdk
|
|
26
28
|
Requires-Dist: python-dateutil
|
|
27
|
-
Requires-Dist: click
|
|
28
29
|
Requires-Dist: dataclasses_json
|
|
29
|
-
Requires-Dist: tqdm
|
|
30
30
|
Requires-Dist: pydantic>=2.7
|
|
31
31
|
Requires-Dist: numpy
|
|
32
32
|
Requires-Dist: pandas
|
|
@@ -117,8 +117,8 @@ Requires-Dist: bs4; extra == "biomed"
|
|
|
117
117
|
Requires-Dist: numpy; extra == "biomed"
|
|
118
118
|
Requires-Dist: pandas; extra == "biomed"
|
|
119
119
|
Provides-Extra: box
|
|
120
|
-
Requires-Dist: boxfs; extra == "box"
|
|
121
120
|
Requires-Dist: fsspec; extra == "box"
|
|
121
|
+
Requires-Dist: boxfs; extra == "box"
|
|
122
122
|
Requires-Dist: numpy; extra == "box"
|
|
123
123
|
Requires-Dist: pandas; extra == "box"
|
|
124
124
|
Provides-Extra: chroma
|
|
@@ -130,8 +130,8 @@ Requires-Dist: clarifai; extra == "clarifai"
|
|
|
130
130
|
Requires-Dist: numpy; extra == "clarifai"
|
|
131
131
|
Requires-Dist: pandas; extra == "clarifai"
|
|
132
132
|
Provides-Extra: confluence
|
|
133
|
-
Requires-Dist: atlassian-python-api; extra == "confluence"
|
|
134
133
|
Requires-Dist: requests; extra == "confluence"
|
|
134
|
+
Requires-Dist: atlassian-python-api; extra == "confluence"
|
|
135
135
|
Requires-Dist: numpy; extra == "confluence"
|
|
136
136
|
Requires-Dist: pandas; extra == "confluence"
|
|
137
137
|
Provides-Extra: couchbase
|
|
@@ -185,10 +185,10 @@ Requires-Dist: urllib3; extra == "hubspot"
|
|
|
185
185
|
Requires-Dist: numpy; extra == "hubspot"
|
|
186
186
|
Requires-Dist: pandas; extra == "hubspot"
|
|
187
187
|
Provides-Extra: ibm-watsonx-s3
|
|
188
|
-
Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
|
|
188
|
+
Requires-Dist: httpx; extra == "ibm-watsonx-s3"
|
|
189
189
|
Requires-Dist: pyiceberg; extra == "ibm-watsonx-s3"
|
|
190
|
+
Requires-Dist: tenacity; extra == "ibm-watsonx-s3"
|
|
190
191
|
Requires-Dist: pyarrow; extra == "ibm-watsonx-s3"
|
|
191
|
-
Requires-Dist: httpx; extra == "ibm-watsonx-s3"
|
|
192
192
|
Requires-Dist: numpy; extra == "ibm-watsonx-s3"
|
|
193
193
|
Requires-Dist: pandas; extra == "ibm-watsonx-s3"
|
|
194
194
|
Provides-Extra: jira
|
|
@@ -222,16 +222,16 @@ Requires-Dist: neo4j-rust-ext; extra == "neo4j"
|
|
|
222
222
|
Requires-Dist: numpy; extra == "neo4j"
|
|
223
223
|
Requires-Dist: pandas; extra == "neo4j"
|
|
224
224
|
Provides-Extra: notion
|
|
225
|
+
Requires-Dist: httpx; extra == "notion"
|
|
225
226
|
Requires-Dist: notion-client; extra == "notion"
|
|
226
|
-
Requires-Dist: backoff; extra == "notion"
|
|
227
227
|
Requires-Dist: htmlBuilder; extra == "notion"
|
|
228
|
-
Requires-Dist: httpx; extra == "notion"
|
|
228
|
+
Requires-Dist: backoff; extra == "notion"
|
|
229
229
|
Requires-Dist: numpy; extra == "notion"
|
|
230
230
|
Requires-Dist: pandas; extra == "notion"
|
|
231
231
|
Provides-Extra: onedrive
|
|
232
|
-
Requires-Dist: msal; extra == "onedrive"
|
|
233
232
|
Requires-Dist: Office365-REST-Python-Client; extra == "onedrive"
|
|
234
233
|
Requires-Dist: bs4; extra == "onedrive"
|
|
234
|
+
Requires-Dist: msal; extra == "onedrive"
|
|
235
235
|
Requires-Dist: numpy; extra == "onedrive"
|
|
236
236
|
Requires-Dist: pandas; extra == "onedrive"
|
|
237
237
|
Provides-Extra: opensearch
|
|
@@ -239,8 +239,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
|
|
|
239
239
|
Requires-Dist: numpy; extra == "opensearch"
|
|
240
240
|
Requires-Dist: pandas; extra == "opensearch"
|
|
241
241
|
Provides-Extra: outlook
|
|
242
|
-
Requires-Dist: msal; extra == "outlook"
|
|
243
242
|
Requires-Dist: Office365-REST-Python-Client; extra == "outlook"
|
|
243
|
+
Requires-Dist: msal; extra == "outlook"
|
|
244
244
|
Requires-Dist: numpy; extra == "outlook"
|
|
245
245
|
Requires-Dist: pandas; extra == "outlook"
|
|
246
246
|
Provides-Extra: pinecone
|
|
@@ -264,13 +264,13 @@ Requires-Dist: redis; extra == "redis"
|
|
|
264
264
|
Requires-Dist: numpy; extra == "redis"
|
|
265
265
|
Requires-Dist: pandas; extra == "redis"
|
|
266
266
|
Provides-Extra: s3
|
|
267
|
-
Requires-Dist: fsspec; extra == "s3"
|
|
268
267
|
Requires-Dist: s3fs; extra == "s3"
|
|
268
|
+
Requires-Dist: fsspec; extra == "s3"
|
|
269
269
|
Requires-Dist: numpy; extra == "s3"
|
|
270
270
|
Requires-Dist: pandas; extra == "s3"
|
|
271
271
|
Provides-Extra: sharepoint
|
|
272
|
-
Requires-Dist: msal; extra == "sharepoint"
|
|
273
272
|
Requires-Dist: Office365-REST-Python-Client; extra == "sharepoint"
|
|
273
|
+
Requires-Dist: msal; extra == "sharepoint"
|
|
274
274
|
Requires-Dist: numpy; extra == "sharepoint"
|
|
275
275
|
Requires-Dist: pandas; extra == "sharepoint"
|
|
276
276
|
Provides-Extra: salesforce
|
|
@@ -318,14 +318,14 @@ Requires-Dist: httpx; extra == "vectara"
|
|
|
318
318
|
Requires-Dist: numpy; extra == "vectara"
|
|
319
319
|
Requires-Dist: pandas; extra == "vectara"
|
|
320
320
|
Provides-Extra: vastdb
|
|
321
|
-
Requires-Dist: pyarrow; extra == "vastdb"
|
|
322
|
-
Requires-Dist: ibis; extra == "vastdb"
|
|
323
321
|
Requires-Dist: vastdb; extra == "vastdb"
|
|
322
|
+
Requires-Dist: ibis; extra == "vastdb"
|
|
323
|
+
Requires-Dist: pyarrow; extra == "vastdb"
|
|
324
324
|
Requires-Dist: numpy; extra == "vastdb"
|
|
325
325
|
Requires-Dist: pandas; extra == "vastdb"
|
|
326
326
|
Provides-Extra: zendesk
|
|
327
|
-
Requires-Dist: aiofiles; extra == "zendesk"
|
|
328
327
|
Requires-Dist: httpx; extra == "zendesk"
|
|
328
|
+
Requires-Dist: aiofiles; extra == "zendesk"
|
|
329
329
|
Requires-Dist: bs4; extra == "zendesk"
|
|
330
330
|
Requires-Dist: numpy; extra == "zendesk"
|
|
331
331
|
Requires-Dist: pandas; extra == "zendesk"
|