unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic. Click here for more details.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/README.md +28 -0
- unstructured_ingest/embed/mixedbreadai.py +0 -1
- unstructured_ingest/interfaces/upload_stager.py +2 -2
- unstructured_ingest/interfaces/uploader.py +3 -3
- unstructured_ingest/main.py +0 -0
- unstructured_ingest/pipeline/interfaces.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/processes/chunker.py +4 -0
- unstructured_ingest/processes/connectors/airtable.py +4 -2
- unstructured_ingest/processes/connectors/astradb.py +2 -2
- unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
- unstructured_ingest/processes/connectors/confluence.py +0 -1
- unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
- unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
- unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
- unstructured_ingest/processes/connectors/delta_table.py +1 -0
- unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
- unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
- unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
- unstructured_ingest/processes/connectors/gitlab.py +1 -2
- unstructured_ingest/processes/connectors/google_drive.py +0 -2
- unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
- unstructured_ingest/processes/connectors/kdbai.py +1 -0
- unstructured_ingest/processes/connectors/outlook.py +1 -2
- unstructured_ingest/processes/connectors/pinecone.py +0 -1
- unstructured_ingest/processes/connectors/redisdb.py +28 -24
- unstructured_ingest/processes/connectors/salesforce.py +1 -1
- unstructured_ingest/processes/connectors/slack.py +1 -2
- unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
- unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
- unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
- unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
- unstructured_ingest/processes/connectors/sql/sql.py +3 -4
- unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
- unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
- unstructured_ingest/processes/connectors/vectara.py +0 -2
- unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
- unstructured_ingest/processes/embedder.py +2 -2
- unstructured_ingest/processes/filter.py +1 -1
- unstructured_ingest/processes/partitioner.py +4 -0
- unstructured_ingest/processes/utils/blob_storage.py +2 -2
- unstructured_ingest/unstructured_api.py +13 -8
- unstructured_ingest/utils/data_prep.py +8 -32
- unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +50 -184
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
- examples/__init__.py +0 -0
- examples/airtable.py +0 -44
- examples/azure_cognitive_search.py +0 -55
- examples/chroma.py +0 -54
- examples/couchbase.py +0 -55
- examples/databricks_volumes_dest.py +0 -55
- examples/databricks_volumes_source.py +0 -53
- examples/delta_table.py +0 -45
- examples/discord_example.py +0 -36
- examples/elasticsearch.py +0 -49
- examples/google_drive.py +0 -45
- examples/kdbai.py +0 -54
- examples/local.py +0 -36
- examples/milvus.py +0 -44
- examples/mongodb.py +0 -53
- examples/opensearch.py +0 -50
- examples/pinecone.py +0 -57
- examples/s3.py +0 -38
- examples/salesforce.py +0 -44
- examples/sharepoint.py +0 -47
- examples/singlestore.py +0 -49
- examples/sql.py +0 -90
- examples/vectara.py +0 -54
- examples/weaviate.py +0 -44
- test/__init__.py +0 -0
- test/integration/__init__.py +0 -0
- test/integration/chunkers/__init__.py +0 -0
- test/integration/chunkers/test_chunkers.py +0 -31
- test/integration/connectors/__init__.py +0 -0
- test/integration/connectors/conftest.py +0 -38
- test/integration/connectors/databricks/__init__.py +0 -0
- test/integration/connectors/databricks/test_volumes_native.py +0 -273
- test/integration/connectors/discord/__init__.py +0 -0
- test/integration/connectors/discord/test_discord.py +0 -90
- test/integration/connectors/duckdb/__init__.py +0 -0
- test/integration/connectors/duckdb/conftest.py +0 -14
- test/integration/connectors/duckdb/test_duckdb.py +0 -90
- test/integration/connectors/duckdb/test_motherduck.py +0 -95
- test/integration/connectors/elasticsearch/__init__.py +0 -0
- test/integration/connectors/elasticsearch/conftest.py +0 -34
- test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
- test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
- test/integration/connectors/sql/__init__.py +0 -0
- test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
- test/integration/connectors/sql/test_postgres.py +0 -201
- test/integration/connectors/sql/test_singlestore.py +0 -182
- test/integration/connectors/sql/test_snowflake.py +0 -244
- test/integration/connectors/sql/test_sqlite.py +0 -168
- test/integration/connectors/sql/test_vastdb.py +0 -34
- test/integration/connectors/test_astradb.py +0 -287
- test/integration/connectors/test_azure_ai_search.py +0 -254
- test/integration/connectors/test_chroma.py +0 -136
- test/integration/connectors/test_confluence.py +0 -111
- test/integration/connectors/test_delta_table.py +0 -183
- test/integration/connectors/test_dropbox.py +0 -151
- test/integration/connectors/test_github.py +0 -49
- test/integration/connectors/test_google_drive.py +0 -257
- test/integration/connectors/test_jira.py +0 -67
- test/integration/connectors/test_lancedb.py +0 -247
- test/integration/connectors/test_milvus.py +0 -208
- test/integration/connectors/test_mongodb.py +0 -335
- test/integration/connectors/test_neo4j.py +0 -244
- test/integration/connectors/test_notion.py +0 -152
- test/integration/connectors/test_onedrive.py +0 -163
- test/integration/connectors/test_pinecone.py +0 -387
- test/integration/connectors/test_qdrant.py +0 -216
- test/integration/connectors/test_redis.py +0 -143
- test/integration/connectors/test_s3.py +0 -184
- test/integration/connectors/test_sharepoint.py +0 -222
- test/integration/connectors/test_vectara.py +0 -282
- test/integration/connectors/test_zendesk.py +0 -120
- test/integration/connectors/utils/__init__.py +0 -0
- test/integration/connectors/utils/constants.py +0 -13
- test/integration/connectors/utils/docker.py +0 -151
- test/integration/connectors/utils/docker_compose.py +0 -59
- test/integration/connectors/utils/validation/__init__.py +0 -0
- test/integration/connectors/utils/validation/destination.py +0 -77
- test/integration/connectors/utils/validation/equality.py +0 -76
- test/integration/connectors/utils/validation/source.py +0 -331
- test/integration/connectors/utils/validation/utils.py +0 -36
- test/integration/connectors/weaviate/__init__.py +0 -0
- test/integration/connectors/weaviate/conftest.py +0 -15
- test/integration/connectors/weaviate/test_cloud.py +0 -39
- test/integration/connectors/weaviate/test_local.py +0 -152
- test/integration/embedders/__init__.py +0 -0
- test/integration/embedders/conftest.py +0 -13
- test/integration/embedders/test_azure_openai.py +0 -57
- test/integration/embedders/test_bedrock.py +0 -103
- test/integration/embedders/test_huggingface.py +0 -24
- test/integration/embedders/test_mixedbread.py +0 -71
- test/integration/embedders/test_octoai.py +0 -75
- test/integration/embedders/test_openai.py +0 -74
- test/integration/embedders/test_togetherai.py +0 -71
- test/integration/embedders/test_vertexai.py +0 -63
- test/integration/embedders/test_voyageai.py +0 -79
- test/integration/embedders/utils.py +0 -66
- test/integration/partitioners/__init__.py +0 -0
- test/integration/partitioners/test_partitioner.py +0 -76
- test/integration/utils.py +0 -15
- test/unit/__init__.py +0 -0
- test/unit/chunkers/__init__.py +0 -0
- test/unit/chunkers/test_chunkers.py +0 -49
- test/unit/connectors/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/__init__.py +0 -0
- test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
- test/unit/connectors/motherduck/__init__.py +0 -0
- test/unit/connectors/motherduck/test_base.py +0 -73
- test/unit/connectors/sql/__init__.py +0 -0
- test/unit/connectors/sql/test_sql.py +0 -152
- test/unit/connectors/test_confluence.py +0 -71
- test/unit/connectors/test_jira.py +0 -401
- test/unit/embed/__init__.py +0 -0
- test/unit/embed/test_mixedbreadai.py +0 -42
- test/unit/embed/test_octoai.py +0 -27
- test/unit/embed/test_openai.py +0 -28
- test/unit/embed/test_vertexai.py +0 -25
- test/unit/embed/test_voyageai.py +0 -24
- test/unit/embedders/__init__.py +0 -0
- test/unit/embedders/test_bedrock.py +0 -36
- test/unit/embedders/test_huggingface.py +0 -48
- test/unit/embedders/test_mixedbread.py +0 -37
- test/unit/embedders/test_octoai.py +0 -35
- test/unit/embedders/test_openai.py +0 -35
- test/unit/embedders/test_togetherai.py +0 -37
- test/unit/embedders/test_vertexai.py +0 -37
- test/unit/embedders/test_voyageai.py +0 -38
- test/unit/partitioners/__init__.py +0 -0
- test/unit/partitioners/test_partitioner.py +0 -63
- test/unit/test_error.py +0 -27
- test/unit/test_html.py +0 -112
- test/unit/test_interfaces.py +0 -26
- test/unit/test_utils.py +0 -220
- test/unit/utils/__init__.py +0 -0
- test/unit/utils/data_generator.py +0 -32
- unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
- unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
examples/singlestore.py
DELETED
|
@@ -1,49 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
|
|
3
|
-
from unstructured_ingest.interfaces import ProcessorConfig
|
|
4
|
-
from unstructured_ingest.logger import logger
|
|
5
|
-
from unstructured_ingest.pipeline.pipeline import Pipeline
|
|
6
|
-
from unstructured_ingest.processes.chunker import ChunkerConfig
|
|
7
|
-
from unstructured_ingest.processes.connectors.local import (
|
|
8
|
-
LocalConnectionConfig,
|
|
9
|
-
LocalDownloaderConfig,
|
|
10
|
-
LocalIndexerConfig,
|
|
11
|
-
)
|
|
12
|
-
from unstructured_ingest.processes.connectors.singlestore import (
|
|
13
|
-
CONNECTOR_TYPE,
|
|
14
|
-
SingleStoreAccessConfig,
|
|
15
|
-
SingleStoreConnectionConfig,
|
|
16
|
-
SingleStoreUploaderConfig,
|
|
17
|
-
SingleStoreUploadStagerConfig,
|
|
18
|
-
)
|
|
19
|
-
from unstructured_ingest.processes.embedder import EmbedderConfig
|
|
20
|
-
from unstructured_ingest.processes.partitioner import PartitionerConfig
|
|
21
|
-
|
|
22
|
-
base_path = Path(__file__).parent.parent.parent.parent
|
|
23
|
-
docs_path = base_path / "example-docs"
|
|
24
|
-
work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
|
|
25
|
-
output_path = work_dir / "output"
|
|
26
|
-
download_path = work_dir / "download"
|
|
27
|
-
|
|
28
|
-
if __name__ == "__main__":
|
|
29
|
-
logger.info(f"writing all content in: {work_dir.resolve()}")
|
|
30
|
-
Pipeline.from_configs(
|
|
31
|
-
context=ProcessorConfig(work_dir=str(work_dir.resolve()), tqdm=True, verbose=True),
|
|
32
|
-
indexer_config=LocalIndexerConfig(
|
|
33
|
-
input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt"
|
|
34
|
-
),
|
|
35
|
-
downloader_config=LocalDownloaderConfig(download_dir=download_path),
|
|
36
|
-
source_connection_config=LocalConnectionConfig(),
|
|
37
|
-
partitioner_config=PartitionerConfig(strategy="fast"),
|
|
38
|
-
chunker_config=ChunkerConfig(chunking_strategy="by_title"),
|
|
39
|
-
embedder_config=EmbedderConfig(embedding_provider="huggingface"),
|
|
40
|
-
destination_connection_config=SingleStoreConnectionConfig(
|
|
41
|
-
access_config=SingleStoreAccessConfig(password="password"),
|
|
42
|
-
host="localhost",
|
|
43
|
-
port=3306,
|
|
44
|
-
database="ingest_test",
|
|
45
|
-
user="root",
|
|
46
|
-
),
|
|
47
|
-
stager_config=SingleStoreUploadStagerConfig(),
|
|
48
|
-
uploader_config=SingleStoreUploaderConfig(table_name="elements"),
|
|
49
|
-
).run()
|
examples/sql.py
DELETED
|
@@ -1,90 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import sqlite3
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
from unstructured_ingest.interfaces import ProcessorConfig
|
|
6
|
-
from unstructured_ingest.logger import logger
|
|
7
|
-
from unstructured_ingest.pipeline.pipeline import Pipeline
|
|
8
|
-
from unstructured_ingest.processes.chunker import ChunkerConfig
|
|
9
|
-
from unstructured_ingest.processes.connectors.local import (
|
|
10
|
-
LocalConnectionConfig,
|
|
11
|
-
LocalDownloaderConfig,
|
|
12
|
-
LocalIndexerConfig,
|
|
13
|
-
)
|
|
14
|
-
from unstructured_ingest.processes.connectors.sql import (
|
|
15
|
-
CONNECTOR_TYPE,
|
|
16
|
-
POSTGRESQL_DB,
|
|
17
|
-
SQLITE_DB,
|
|
18
|
-
SQLAccessConfig,
|
|
19
|
-
SQLConnectionConfig,
|
|
20
|
-
SQLUploaderConfig,
|
|
21
|
-
SQLUploadStagerConfig,
|
|
22
|
-
)
|
|
23
|
-
from unstructured_ingest.processes.embedder import EmbedderConfig
|
|
24
|
-
from unstructured_ingest.processes.partitioner import PartitionerConfig
|
|
25
|
-
|
|
26
|
-
base_path = Path(__file__).parent.parent.parent.parent
|
|
27
|
-
docs_path = base_path / "example-docs"
|
|
28
|
-
work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
|
|
29
|
-
output_path = work_dir / "output"
|
|
30
|
-
download_path = work_dir / "download"
|
|
31
|
-
|
|
32
|
-
SQLITE_DB_PATH = "test-sql-db.sqlite"
|
|
33
|
-
|
|
34
|
-
if __name__ == "__main__":
|
|
35
|
-
logger.info(f"writing all content in: {work_dir.resolve()}")
|
|
36
|
-
|
|
37
|
-
configs = {
|
|
38
|
-
"context": ProcessorConfig(work_dir=str(work_dir.resolve())),
|
|
39
|
-
"indexer_config": LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"),
|
|
40
|
-
"downloader_config": LocalDownloaderConfig(download_dir=download_path),
|
|
41
|
-
"source_connection_config": LocalConnectionConfig(),
|
|
42
|
-
"partitioner_config": PartitionerConfig(strategy="fast"),
|
|
43
|
-
"chunker_config": ChunkerConfig(
|
|
44
|
-
chunking_strategy="by_title",
|
|
45
|
-
chunk_include_orig_elements=False,
|
|
46
|
-
chunk_max_characters=1500,
|
|
47
|
-
chunk_multipage_sections=True,
|
|
48
|
-
),
|
|
49
|
-
"embedder_config": EmbedderConfig(embedding_provider="huggingface"),
|
|
50
|
-
"stager_config": SQLUploadStagerConfig(),
|
|
51
|
-
"uploader_config": SQLUploaderConfig(batch_size=10),
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
if os.path.exists(SQLITE_DB):
|
|
55
|
-
os.remove(SQLITE_DB)
|
|
56
|
-
|
|
57
|
-
connection = sqlite3.connect(database=SQLITE_DB)
|
|
58
|
-
|
|
59
|
-
query = None
|
|
60
|
-
script_path = (
|
|
61
|
-
Path(__file__).parent.parent.parent.parent.parent
|
|
62
|
-
/ Path("test_e2e/env_setup/sql/sqlite-schema.sql")
|
|
63
|
-
).resolve()
|
|
64
|
-
with open(script_path) as f:
|
|
65
|
-
query = f.read()
|
|
66
|
-
cursor = connection.cursor()
|
|
67
|
-
cursor.executescript(query)
|
|
68
|
-
connection.close()
|
|
69
|
-
|
|
70
|
-
# sqlite test first
|
|
71
|
-
Pipeline.from_configs(
|
|
72
|
-
destination_connection_config=SQLConnectionConfig(
|
|
73
|
-
db_type=SQLITE_DB,
|
|
74
|
-
database=SQLITE_DB_PATH,
|
|
75
|
-
access_config=SQLAccessConfig(),
|
|
76
|
-
),
|
|
77
|
-
**configs,
|
|
78
|
-
).run()
|
|
79
|
-
|
|
80
|
-
# now, pg with pgvector
|
|
81
|
-
Pipeline.from_configs(
|
|
82
|
-
destination_connection_config=SQLConnectionConfig(
|
|
83
|
-
db_type=POSTGRESQL_DB,
|
|
84
|
-
database="elements",
|
|
85
|
-
host="localhost",
|
|
86
|
-
port=5433,
|
|
87
|
-
access_config=SQLAccessConfig(username="unstructured", password="test"),
|
|
88
|
-
),
|
|
89
|
-
**configs,
|
|
90
|
-
).run()
|
examples/vectara.py
DELETED
|
@@ -1,54 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
|
|
3
|
-
from unstructured_ingest.interfaces import ProcessorConfig
|
|
4
|
-
from unstructured_ingest.logger import logger
|
|
5
|
-
from unstructured_ingest.pipeline.pipeline import Pipeline
|
|
6
|
-
from unstructured_ingest.processes.chunker import ChunkerConfig
|
|
7
|
-
from unstructured_ingest.processes.connectors.local import (
|
|
8
|
-
LocalConnectionConfig,
|
|
9
|
-
LocalDownloaderConfig,
|
|
10
|
-
LocalIndexerConfig,
|
|
11
|
-
)
|
|
12
|
-
from unstructured_ingest.processes.connectors.vectara import (
|
|
13
|
-
CONNECTOR_TYPE,
|
|
14
|
-
VectaraAccessConfig,
|
|
15
|
-
VectaraConnectionConfig,
|
|
16
|
-
VectaraUploaderConfig,
|
|
17
|
-
VectaraUploadStagerConfig,
|
|
18
|
-
)
|
|
19
|
-
from unstructured_ingest.processes.embedder import EmbedderConfig
|
|
20
|
-
from unstructured_ingest.processes.partitioner import PartitionerConfig
|
|
21
|
-
|
|
22
|
-
base_path = Path(__file__).parent.parent.parent.parent
|
|
23
|
-
docs_path = base_path / "example-docs"
|
|
24
|
-
work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
|
|
25
|
-
output_path = work_dir / "output"
|
|
26
|
-
download_path = work_dir / "download"
|
|
27
|
-
|
|
28
|
-
if __name__ == "__main__":
|
|
29
|
-
logger.info(f"writing all content in: {work_dir.resolve()}")
|
|
30
|
-
Pipeline.from_configs(
|
|
31
|
-
context=ProcessorConfig(work_dir=str(work_dir.resolve())),
|
|
32
|
-
indexer_config=LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"),
|
|
33
|
-
downloader_config=LocalDownloaderConfig(download_dir=download_path),
|
|
34
|
-
source_connection_config=LocalConnectionConfig(),
|
|
35
|
-
partitioner_config=PartitionerConfig(strategy="fast"),
|
|
36
|
-
chunker_config=ChunkerConfig(
|
|
37
|
-
chunking_strategy="by_title",
|
|
38
|
-
chunk_include_orig_elements=False,
|
|
39
|
-
chunk_max_characters=1500,
|
|
40
|
-
chunk_multipage_sections=True,
|
|
41
|
-
),
|
|
42
|
-
embedder_config=EmbedderConfig(embedding_provider="huggingface"),
|
|
43
|
-
destination_connection_config=VectaraConnectionConfig(
|
|
44
|
-
access_config=VectaraAccessConfig(
|
|
45
|
-
oauth_client_id="fill oauth_client_id", oauth_secret="fill oauth_secret"
|
|
46
|
-
),
|
|
47
|
-
customer_id="fill customer_id",
|
|
48
|
-
corpus_name="fill corpus_name",
|
|
49
|
-
corpus_key="fill corpus_key",
|
|
50
|
-
token_url="fill token_url",
|
|
51
|
-
),
|
|
52
|
-
stager_config=VectaraUploadStagerConfig(batch_size=10),
|
|
53
|
-
uploader_config=VectaraUploaderConfig(),
|
|
54
|
-
).run()
|
examples/weaviate.py
DELETED
|
@@ -1,44 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
|
|
3
|
-
from unstructured_ingest.interfaces import ProcessorConfig
|
|
4
|
-
from unstructured_ingest.logger import logger
|
|
5
|
-
from unstructured_ingest.pipeline.pipeline import Pipeline
|
|
6
|
-
from unstructured_ingest.processes.chunker import ChunkerConfig
|
|
7
|
-
from unstructured_ingest.processes.connectors.local import (
|
|
8
|
-
LocalConnectionConfig,
|
|
9
|
-
LocalDownloaderConfig,
|
|
10
|
-
LocalIndexerConfig,
|
|
11
|
-
)
|
|
12
|
-
from unstructured_ingest.processes.connectors.weaviate.local import (
|
|
13
|
-
CONNECTOR_TYPE,
|
|
14
|
-
LocalWeaviateConnectionConfig,
|
|
15
|
-
LocalWeaviateUploaderConfig,
|
|
16
|
-
LocalWeaviateUploadStagerConfig,
|
|
17
|
-
)
|
|
18
|
-
from unstructured_ingest.processes.embedder import EmbedderConfig
|
|
19
|
-
from unstructured_ingest.processes.partitioner import PartitionerConfig
|
|
20
|
-
|
|
21
|
-
base_path = Path(__file__).parent.parent.parent.parent
|
|
22
|
-
docs_path = base_path / "example-docs"
|
|
23
|
-
work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
|
|
24
|
-
output_path = work_dir / "output"
|
|
25
|
-
download_path = work_dir / "download"
|
|
26
|
-
|
|
27
|
-
if __name__ == "__main__":
|
|
28
|
-
logger.info(f"writing all content in: {work_dir.resolve()}")
|
|
29
|
-
Pipeline.from_configs(
|
|
30
|
-
context=ProcessorConfig(work_dir=str(work_dir.resolve())),
|
|
31
|
-
indexer_config=LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"),
|
|
32
|
-
downloader_config=LocalDownloaderConfig(download_dir=download_path),
|
|
33
|
-
source_connection_config=LocalConnectionConfig(),
|
|
34
|
-
partitioner_config=PartitionerConfig(strategy="fast"),
|
|
35
|
-
chunker_config=ChunkerConfig(chunking_strategy="by_title"),
|
|
36
|
-
embedder_config=EmbedderConfig(embedding_provider="huggingface"),
|
|
37
|
-
destination_connection_config=LocalWeaviateConnectionConfig(
|
|
38
|
-
# Connects to http://localhost:8080
|
|
39
|
-
),
|
|
40
|
-
stager_config=LocalWeaviateUploadStagerConfig(),
|
|
41
|
-
uploader_config=LocalWeaviateUploaderConfig(
|
|
42
|
-
collection="elements", batch_size=10, dynamic_batch=False
|
|
43
|
-
),
|
|
44
|
-
).run()
|
test/__init__.py
DELETED
|
File without changes
|
test/integration/__init__.py
DELETED
|
File without changes
|
|
File without changes
|
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
from pathlib import Path
|
|
3
|
-
|
|
4
|
-
import pytest
|
|
5
|
-
|
|
6
|
-
from test.integration.utils import requires_env
|
|
7
|
-
from unstructured_ingest.processes.chunker import Chunker, ChunkerConfig
|
|
8
|
-
|
|
9
|
-
int_test_dir = Path(__file__).parent
|
|
10
|
-
assets_dir = int_test_dir / "assets"
|
|
11
|
-
|
|
12
|
-
chunker_files = [path for path in assets_dir.iterdir() if path.is_file()]
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
@pytest.mark.parametrize("chunker_file", chunker_files, ids=[path.name for path in chunker_files])
|
|
16
|
-
@pytest.mark.parametrize("strategy", ["basic", "by_title", "by_similarity", "by_page"])
|
|
17
|
-
@requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
|
|
18
|
-
@pytest.mark.asyncio
|
|
19
|
-
async def test_chunker_api(chunker_file: Path, strategy: str):
|
|
20
|
-
api_key = os.getenv("UNSTRUCTURED_API_KEY")
|
|
21
|
-
api_url = os.getenv("UNSTRUCTURED_API_URL")
|
|
22
|
-
|
|
23
|
-
chunker_config = ChunkerConfig(
|
|
24
|
-
chunking_strategy=strategy,
|
|
25
|
-
chunk_by_api=True,
|
|
26
|
-
chunk_api_key=api_key,
|
|
27
|
-
chunking_endpoint=api_url,
|
|
28
|
-
)
|
|
29
|
-
chunker = Chunker(config=chunker_config)
|
|
30
|
-
results = await chunker.run_async(elements_filepath=chunker_file)
|
|
31
|
-
assert results
|
|
File without changes
|
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
import tempfile
|
|
2
|
-
from pathlib import Path
|
|
3
|
-
from typing import Generator
|
|
4
|
-
|
|
5
|
-
import pytest
|
|
6
|
-
|
|
7
|
-
from unstructured_ingest.logger import logger
|
|
8
|
-
|
|
9
|
-
FILENAME = Path("DA-1p-with-duplicate-pages.pdf.json")
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
@pytest.fixture
|
|
13
|
-
def upload_file() -> Path:
|
|
14
|
-
int_test_dir = Path(__file__).parent
|
|
15
|
-
assets_dir = int_test_dir / "assets"
|
|
16
|
-
upload_file = assets_dir / FILENAME
|
|
17
|
-
assert upload_file.exists()
|
|
18
|
-
assert upload_file.is_file()
|
|
19
|
-
return upload_file
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
@pytest.fixture
|
|
23
|
-
def upload_file_ndjson() -> Path:
|
|
24
|
-
int_test_dir = Path(__file__).parent
|
|
25
|
-
assets_dir = int_test_dir / "assets"
|
|
26
|
-
upload_file = assets_dir / FILENAME.with_suffix(".ndjson")
|
|
27
|
-
assert upload_file.exists()
|
|
28
|
-
assert upload_file.is_file()
|
|
29
|
-
return upload_file
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
@pytest.fixture
|
|
33
|
-
def temp_dir() -> Generator[Path, None, None]:
|
|
34
|
-
with tempfile.TemporaryDirectory() as temp_dir:
|
|
35
|
-
temp_path = Path(temp_dir)
|
|
36
|
-
logger.info(f"Created temp dir '{temp_path}'")
|
|
37
|
-
yield temp_path
|
|
38
|
-
logger.info(f"Removing temp dir '{temp_path}'")
|
|
File without changes
|
|
@@ -1,273 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import os
|
|
3
|
-
import uuid
|
|
4
|
-
from contextlib import contextmanager
|
|
5
|
-
from dataclasses import dataclass
|
|
6
|
-
from pathlib import Path
|
|
7
|
-
from unittest import mock
|
|
8
|
-
|
|
9
|
-
import pytest
|
|
10
|
-
from databricks.sdk import WorkspaceClient
|
|
11
|
-
from databricks.sdk.errors.platform import NotFound
|
|
12
|
-
|
|
13
|
-
from test.integration.connectors.utils.constants import (
|
|
14
|
-
BLOB_STORAGE_TAG,
|
|
15
|
-
DESTINATION_TAG,
|
|
16
|
-
SOURCE_TAG,
|
|
17
|
-
)
|
|
18
|
-
from test.integration.connectors.utils.validation.source import (
|
|
19
|
-
SourceValidationConfigs,
|
|
20
|
-
source_connector_validation,
|
|
21
|
-
)
|
|
22
|
-
from test.integration.utils import requires_env
|
|
23
|
-
from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
|
|
24
|
-
from unstructured_ingest.errors_v2 import UserAuthError, UserError
|
|
25
|
-
from unstructured_ingest.processes.connectors.databricks.volumes_native import (
|
|
26
|
-
CONNECTOR_TYPE,
|
|
27
|
-
DatabricksNativeVolumesAccessConfig,
|
|
28
|
-
DatabricksNativeVolumesConnectionConfig,
|
|
29
|
-
DatabricksNativeVolumesDownloader,
|
|
30
|
-
DatabricksNativeVolumesDownloaderConfig,
|
|
31
|
-
DatabricksNativeVolumesIndexer,
|
|
32
|
-
DatabricksNativeVolumesIndexerConfig,
|
|
33
|
-
DatabricksNativeVolumesUploader,
|
|
34
|
-
DatabricksNativeVolumesUploaderConfig,
|
|
35
|
-
)
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
@dataclass
|
|
39
|
-
class BaseEnvData:
|
|
40
|
-
host: str
|
|
41
|
-
catalog: str
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
@dataclass
|
|
45
|
-
class BasicAuthEnvData(BaseEnvData):
|
|
46
|
-
client_id: str
|
|
47
|
-
client_secret: str
|
|
48
|
-
|
|
49
|
-
def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
|
|
50
|
-
return DatabricksNativeVolumesConnectionConfig(
|
|
51
|
-
host=self.host,
|
|
52
|
-
access_config=DatabricksNativeVolumesAccessConfig(
|
|
53
|
-
client_id=self.client_id,
|
|
54
|
-
client_secret=self.client_secret,
|
|
55
|
-
),
|
|
56
|
-
)
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
@dataclass
|
|
60
|
-
class PATEnvData(BaseEnvData):
|
|
61
|
-
token: str
|
|
62
|
-
|
|
63
|
-
def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
|
|
64
|
-
return DatabricksNativeVolumesConnectionConfig(
|
|
65
|
-
host=self.host,
|
|
66
|
-
access_config=DatabricksNativeVolumesAccessConfig(
|
|
67
|
-
token=self.token,
|
|
68
|
-
),
|
|
69
|
-
)
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
def get_basic_auth_env_data() -> BasicAuthEnvData:
|
|
73
|
-
return BasicAuthEnvData(
|
|
74
|
-
host=os.environ["DATABRICKS_HOST"],
|
|
75
|
-
client_id=os.environ["DATABRICKS_CLIENT_ID"],
|
|
76
|
-
client_secret=os.environ["DATABRICKS_CLIENT_SECRET"],
|
|
77
|
-
catalog=os.environ["DATABRICKS_CATALOG"],
|
|
78
|
-
)
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
def get_pat_env_data() -> PATEnvData:
|
|
82
|
-
return PATEnvData(
|
|
83
|
-
host=os.environ["DATABRICKS_HOST"],
|
|
84
|
-
catalog=os.environ["DATABRICKS_CATALOG"],
|
|
85
|
-
token=os.environ["DATABRICKS_PAT"],
|
|
86
|
-
)
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
@pytest.mark.asyncio
|
|
90
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
|
|
91
|
-
@requires_env(
|
|
92
|
-
"DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
|
|
93
|
-
)
|
|
94
|
-
async def test_volumes_native_source(tmp_path: Path):
|
|
95
|
-
env_data = get_basic_auth_env_data()
|
|
96
|
-
with mock.patch.dict(os.environ, clear=True):
|
|
97
|
-
indexer_config = DatabricksNativeVolumesIndexerConfig(
|
|
98
|
-
recursive=True,
|
|
99
|
-
volume="test-platform",
|
|
100
|
-
volume_path="databricks-volumes-test-input",
|
|
101
|
-
catalog=env_data.catalog,
|
|
102
|
-
)
|
|
103
|
-
connection_config = env_data.get_connection_config()
|
|
104
|
-
download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tmp_path)
|
|
105
|
-
indexer = DatabricksNativeVolumesIndexer(
|
|
106
|
-
connection_config=connection_config, index_config=indexer_config
|
|
107
|
-
)
|
|
108
|
-
downloader = DatabricksNativeVolumesDownloader(
|
|
109
|
-
connection_config=connection_config, download_config=download_config
|
|
110
|
-
)
|
|
111
|
-
await source_connector_validation(
|
|
112
|
-
indexer=indexer,
|
|
113
|
-
downloader=downloader,
|
|
114
|
-
configs=SourceValidationConfigs(
|
|
115
|
-
test_id="databricks_volumes_native",
|
|
116
|
-
expected_num_files=1,
|
|
117
|
-
),
|
|
118
|
-
)
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
@pytest.mark.asyncio
|
|
122
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
|
|
123
|
-
@requires_env("DATABRICKS_HOST", "DATABRICKS_PAT", "DATABRICKS_CATALOG")
|
|
124
|
-
async def test_volumes_native_source_pat(tmp_path: Path):
|
|
125
|
-
env_data = get_pat_env_data()
|
|
126
|
-
with mock.patch.dict(os.environ, clear=True):
|
|
127
|
-
indexer_config = DatabricksNativeVolumesIndexerConfig(
|
|
128
|
-
recursive=True,
|
|
129
|
-
volume="test-platform",
|
|
130
|
-
volume_path="databricks-volumes-test-input",
|
|
131
|
-
catalog=env_data.catalog,
|
|
132
|
-
)
|
|
133
|
-
connection_config = env_data.get_connection_config()
|
|
134
|
-
download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tmp_path)
|
|
135
|
-
indexer = DatabricksNativeVolumesIndexer(
|
|
136
|
-
connection_config=connection_config, index_config=indexer_config
|
|
137
|
-
)
|
|
138
|
-
downloader = DatabricksNativeVolumesDownloader(
|
|
139
|
-
connection_config=connection_config, download_config=download_config
|
|
140
|
-
)
|
|
141
|
-
await source_connector_validation(
|
|
142
|
-
indexer=indexer,
|
|
143
|
-
downloader=downloader,
|
|
144
|
-
configs=SourceValidationConfigs(
|
|
145
|
-
test_id="databricks_volumes_native_pat",
|
|
146
|
-
expected_num_files=1,
|
|
147
|
-
),
|
|
148
|
-
)
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
|
|
152
|
-
@requires_env("DATABRICKS_HOST", "DATABRICKS_PAT", "DATABRICKS_CATALOG")
|
|
153
|
-
def test_volumes_native_source_pat_invalid_catalog():
|
|
154
|
-
env_data = get_pat_env_data()
|
|
155
|
-
with mock.patch.dict(os.environ, clear=True):
|
|
156
|
-
indexer_config = DatabricksNativeVolumesIndexerConfig(
|
|
157
|
-
recursive=True,
|
|
158
|
-
volume="test-platform",
|
|
159
|
-
volume_path="databricks-volumes-test-input",
|
|
160
|
-
catalog="fake_catalog",
|
|
161
|
-
)
|
|
162
|
-
indexer = DatabricksNativeVolumesIndexer(
|
|
163
|
-
connection_config=env_data.get_connection_config(), index_config=indexer_config
|
|
164
|
-
)
|
|
165
|
-
with pytest.raises(UserError):
|
|
166
|
-
_ = list(indexer.run())
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
|
|
170
|
-
@requires_env("DATABRICKS_HOST")
|
|
171
|
-
def test_volumes_native_source_pat_invalid_pat():
|
|
172
|
-
host = os.environ["DATABRICKS_HOST"]
|
|
173
|
-
with mock.patch.dict(os.environ, clear=True):
|
|
174
|
-
indexer_config = DatabricksNativeVolumesIndexerConfig(
|
|
175
|
-
recursive=True,
|
|
176
|
-
volume="test-platform",
|
|
177
|
-
volume_path="databricks-volumes-test-input",
|
|
178
|
-
catalog="fake_catalog",
|
|
179
|
-
)
|
|
180
|
-
connection_config = DatabricksNativeVolumesConnectionConfig(
|
|
181
|
-
host=host,
|
|
182
|
-
access_config=DatabricksNativeVolumesAccessConfig(
|
|
183
|
-
token="invalid-token",
|
|
184
|
-
),
|
|
185
|
-
)
|
|
186
|
-
indexer = DatabricksNativeVolumesIndexer(
|
|
187
|
-
connection_config=connection_config, index_config=indexer_config
|
|
188
|
-
)
|
|
189
|
-
with pytest.raises(UserAuthError):
|
|
190
|
-
_ = list(indexer.run())
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
def _get_volume_path(catalog: str, volume: str, volume_path: str):
|
|
194
|
-
return f"/Volumes/{catalog}/default/{volume}/{volume_path}"
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
@contextmanager
def databricks_destination_context(
    env_data: BasicAuthEnvData, volume: str, volume_path
) -> WorkspaceClient:
    """Yield a WorkspaceClient and remove the destination directory on exit.

    The client is built from ``env_data.host``, ``env_data.client_id`` and
    ``env_data.client_secret``. When the ``with`` body finishes (normally or
    with an exception), every entry listed in the target directory is deleted,
    followed by the directory itself. ``NotFound`` raised during that cleanup
    is ignored.

    NOTE(review): the return annotation names the yielded value; because this
    function is a generator wrapped by ``@contextmanager``, an iterator/generator
    annotation would describe it more precisely.
    """
    workspace = WorkspaceClient(
        host=env_data.host,
        client_id=env_data.client_id,
        client_secret=env_data.client_secret,
    )
    try:
        yield workspace
    finally:
        # Cleanup
        target_dir = _get_volume_path(env_data.catalog, volume, volume_path)
        try:
            for entry in workspace.files.list_directory_contents(directory_path=target_dir):
                workspace.files.delete(entry.path)
            workspace.files.delete_directory(target_dir)
        except NotFound:
            # Directory was never created, don't need to delete
            pass
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
def validate_upload(client: WorkspaceClient, catalog: str, volume: str, volume_path: str):
    """Assert that the destination directory holds the expected upload.

    Lists the directory returned by ``_get_volume_path``, requires exactly one
    entry, downloads it, parses its contents as JSON and requires 22 items
    whose ``"type"`` values are all ``"CompositeElement"``.
    """
    directory = _get_volume_path(catalog, volume, volume_path)
    uploaded = list(client.files.list_directory_contents(directory_path=directory))

    assert len(uploaded) == 1

    download = client.files.download(uploaded[0].path)
    raw_payload = download.contents.read()
    elements = json.loads(raw_payload)

    assert len(elements) == 22
    # Collapse to the distinct element types so the two checks below can
    # confirm there is a single type and that it is the expected one.
    distinct_types = {element["type"] for element in elements}
    assert len(distinct_types) == 1
    assert "CompositeElement" in distinct_types
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
@requires_env(
    "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
)
async def test_volumes_native_destination(upload_file: Path):
    """Upload ``upload_file`` with the native volumes uploader and validate the result.

    A uuid-suffixed output path is used for the upload, and
    ``databricks_destination_context`` wraps the upload and validation.
    """
    env_data = get_basic_auth_env_data()
    target_volume = "test-platform"
    output_path = f"databricks-volumes-test-output-{uuid.uuid4()}"
    identifiers = SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name)
    file_data = FileData(
        source_identifiers=identifiers,
        connector_type=CONNECTOR_TYPE,
        identifier="mock file data",
    )
    with databricks_destination_context(
        volume=target_volume, volume_path=output_path, env_data=env_data
    ) as workspace_client:
        connection_config = env_data.get_connection_config()
        upload_config = DatabricksNativeVolumesUploaderConfig(
            volume=target_volume,
            volume_path=output_path,
            catalog=env_data.catalog,
        )
        uploader = DatabricksNativeVolumesUploader(
            connection_config=connection_config,
            upload_config=upload_config,
        )
        uploader.precheck()
        # Use whichever entry point the uploader reports for itself.
        if not uploader.is_async():
            uploader.run(path=upload_file, file_data=file_data)
        else:
            await uploader.run_async(path=upload_file, file_data=file_data)

        validate_upload(
            client=workspace_client,
            catalog=env_data.catalog,
            volume=target_volume,
            volume_path=output_path,
        )
|
|
File without changes
|
|
@@ -1,90 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import tempfile
|
|
3
|
-
from dataclasses import dataclass
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
from typing import Optional
|
|
6
|
-
|
|
7
|
-
import pytest
|
|
8
|
-
|
|
9
|
-
from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
|
|
10
|
-
from test.integration.connectors.utils.validation.source import (
|
|
11
|
-
SourceValidationConfigs,
|
|
12
|
-
source_connector_validation,
|
|
13
|
-
)
|
|
14
|
-
from test.integration.utils import requires_env
|
|
15
|
-
from unstructured_ingest.error import SourceConnectionError
|
|
16
|
-
from unstructured_ingest.processes.connectors.discord import (
|
|
17
|
-
CONNECTOR_TYPE,
|
|
18
|
-
DiscordAccessConfig,
|
|
19
|
-
DiscordConnectionConfig,
|
|
20
|
-
DiscordDownloader,
|
|
21
|
-
DiscordDownloaderConfig,
|
|
22
|
-
DiscordIndexer,
|
|
23
|
-
DiscordIndexerConfig,
|
|
24
|
-
)
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
@dataclass(frozen=True)
class EnvData:
    """Discord settings read from environment variables."""

    # Value of DISCORD_TOKEN, or None when the variable is not set.
    token: Optional[str]
    # Channel ids parsed from the comma-separated DISCORD_CHANNELS variable.
    channels: Optional[list[str]]


def get_env_data() -> EnvData:
    """Read ``DISCORD_TOKEN`` and ``DISCORD_CHANNELS`` from the environment.

    ``DISCORD_CHANNELS`` is treated as a comma-separated list. Surrounding
    whitespace is stripped from each entry and empty entries are dropped, so
    an unset or empty variable yields ``[]``.

    Returns:
        An ``EnvData`` holding the token (``None`` if unset) and the channel list.
    """
    # The fallback must be a string: a list default has no ``.split`` and made
    # this function raise AttributeError whenever the variable was unset.
    raw_channels = os.getenv("DISCORD_CHANNELS", default="")
    channels = [part.strip() for part in raw_channels.split(",") if part.strip()]
    return EnvData(token=os.getenv("DISCORD_TOKEN"), channels=channels)
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
@pytest.mark.asyncio
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
@requires_env("DISCORD_TOKEN", "DISCORD_CHANNELS")
async def test_discord_source():
    """Index and download the configured channels, expecting one file per channel."""
    discord_env = get_env_data()
    index_config = DiscordIndexerConfig(channels=discord_env.channels)
    with tempfile.TemporaryDirectory() as scratch_dir:
        download_dir = Path(scratch_dir)
        access_config = DiscordAccessConfig(token=discord_env.token)
        connection_config = DiscordConnectionConfig(access_config=access_config)
        download_config = DiscordDownloaderConfig(download_dir=download_dir)
        indexer = DiscordIndexer(
            connection_config=connection_config,
            index_config=index_config,
        )
        downloader = DiscordDownloader(
            connection_config=connection_config,
            download_config=download_config,
        )
        # The same count is used for downloaded files and indexed file data.
        channel_count = len(discord_env.channels)
        validation_configs = SourceValidationConfigs(
            test_id=CONNECTOR_TYPE,
            expected_num_files=channel_count,
            expected_number_indexed_file_data=channel_count,
            validate_downloaded_files=True,
        )
        await source_connector_validation(
            indexer=indexer,
            downloader=downloader,
            configs=validation_configs,
        )
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
@requires_env("DISCORD_CHANNELS")
def test_discord_source_precheck_fail_no_token():
    """precheck() must raise SourceConnectionError when the token is an empty string."""
    channels = get_env_data().channels
    index_config = DiscordIndexerConfig(channels=channels)

    empty_token_access = DiscordAccessConfig(token="")
    connection_config = DiscordConnectionConfig(access_config=empty_token_access)
    indexer = DiscordIndexer(
        connection_config=connection_config,
        index_config=index_config,
    )
    with pytest.raises(SourceConnectionError):
        indexer.precheck()
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
@pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
@requires_env("DISCORD_TOKEN")
def test_discord_source_precheck_fail_no_channels():
    """precheck() must raise SourceConnectionError when the channel list is empty."""
    no_channels_config = DiscordIndexerConfig(channels=[])

    token = get_env_data().token
    access_config = DiscordAccessConfig(token=token)
    connection_config = DiscordConnectionConfig(access_config=access_config)
    indexer = DiscordIndexer(
        connection_config=connection_config,
        index_config=no_channels_config,
    )
    with pytest.raises(SourceConnectionError):
        indexer.precheck()
|
|
File without changes
|