unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (187)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/main.py +0 -0
  7. unstructured_ingest/pipeline/interfaces.py +1 -1
  8. unstructured_ingest/pipeline/pipeline.py +1 -1
  9. unstructured_ingest/processes/chunker.py +4 -0
  10. unstructured_ingest/processes/connectors/airtable.py +4 -2
  11. unstructured_ingest/processes/connectors/astradb.py +2 -2
  12. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  13. unstructured_ingest/processes/connectors/confluence.py +0 -1
  14. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  15. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  16. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  18. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  19. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  20. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  21. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  22. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  23. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  24. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  25. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  26. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  27. unstructured_ingest/processes/connectors/outlook.py +1 -2
  28. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  29. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  30. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  31. unstructured_ingest/processes/connectors/slack.py +1 -2
  32. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  33. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  34. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  35. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  37. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  38. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  39. unstructured_ingest/processes/connectors/vectara.py +0 -2
  40. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  41. unstructured_ingest/processes/embedder.py +2 -2
  42. unstructured_ingest/processes/filter.py +1 -1
  43. unstructured_ingest/processes/partitioner.py +4 -0
  44. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  45. unstructured_ingest/unstructured_api.py +13 -8
  46. unstructured_ingest/utils/data_prep.py +8 -32
  47. unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
  48. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +50 -184
  49. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
  50. examples/__init__.py +0 -0
  51. examples/airtable.py +0 -44
  52. examples/azure_cognitive_search.py +0 -55
  53. examples/chroma.py +0 -54
  54. examples/couchbase.py +0 -55
  55. examples/databricks_volumes_dest.py +0 -55
  56. examples/databricks_volumes_source.py +0 -53
  57. examples/delta_table.py +0 -45
  58. examples/discord_example.py +0 -36
  59. examples/elasticsearch.py +0 -49
  60. examples/google_drive.py +0 -45
  61. examples/kdbai.py +0 -54
  62. examples/local.py +0 -36
  63. examples/milvus.py +0 -44
  64. examples/mongodb.py +0 -53
  65. examples/opensearch.py +0 -50
  66. examples/pinecone.py +0 -57
  67. examples/s3.py +0 -38
  68. examples/salesforce.py +0 -44
  69. examples/sharepoint.py +0 -47
  70. examples/singlestore.py +0 -49
  71. examples/sql.py +0 -90
  72. examples/vectara.py +0 -54
  73. examples/weaviate.py +0 -44
  74. test/__init__.py +0 -0
  75. test/integration/__init__.py +0 -0
  76. test/integration/chunkers/__init__.py +0 -0
  77. test/integration/chunkers/test_chunkers.py +0 -31
  78. test/integration/connectors/__init__.py +0 -0
  79. test/integration/connectors/conftest.py +0 -38
  80. test/integration/connectors/databricks/__init__.py +0 -0
  81. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  82. test/integration/connectors/discord/__init__.py +0 -0
  83. test/integration/connectors/discord/test_discord.py +0 -90
  84. test/integration/connectors/duckdb/__init__.py +0 -0
  85. test/integration/connectors/duckdb/conftest.py +0 -14
  86. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  87. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  88. test/integration/connectors/elasticsearch/__init__.py +0 -0
  89. test/integration/connectors/elasticsearch/conftest.py +0 -34
  90. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  91. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  92. test/integration/connectors/sql/__init__.py +0 -0
  93. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  94. test/integration/connectors/sql/test_postgres.py +0 -201
  95. test/integration/connectors/sql/test_singlestore.py +0 -182
  96. test/integration/connectors/sql/test_snowflake.py +0 -244
  97. test/integration/connectors/sql/test_sqlite.py +0 -168
  98. test/integration/connectors/sql/test_vastdb.py +0 -34
  99. test/integration/connectors/test_astradb.py +0 -287
  100. test/integration/connectors/test_azure_ai_search.py +0 -254
  101. test/integration/connectors/test_chroma.py +0 -136
  102. test/integration/connectors/test_confluence.py +0 -111
  103. test/integration/connectors/test_delta_table.py +0 -183
  104. test/integration/connectors/test_dropbox.py +0 -151
  105. test/integration/connectors/test_github.py +0 -49
  106. test/integration/connectors/test_google_drive.py +0 -257
  107. test/integration/connectors/test_jira.py +0 -67
  108. test/integration/connectors/test_lancedb.py +0 -247
  109. test/integration/connectors/test_milvus.py +0 -208
  110. test/integration/connectors/test_mongodb.py +0 -335
  111. test/integration/connectors/test_neo4j.py +0 -244
  112. test/integration/connectors/test_notion.py +0 -152
  113. test/integration/connectors/test_onedrive.py +0 -163
  114. test/integration/connectors/test_pinecone.py +0 -387
  115. test/integration/connectors/test_qdrant.py +0 -216
  116. test/integration/connectors/test_redis.py +0 -143
  117. test/integration/connectors/test_s3.py +0 -184
  118. test/integration/connectors/test_sharepoint.py +0 -222
  119. test/integration/connectors/test_vectara.py +0 -282
  120. test/integration/connectors/test_zendesk.py +0 -120
  121. test/integration/connectors/utils/__init__.py +0 -0
  122. test/integration/connectors/utils/constants.py +0 -13
  123. test/integration/connectors/utils/docker.py +0 -151
  124. test/integration/connectors/utils/docker_compose.py +0 -59
  125. test/integration/connectors/utils/validation/__init__.py +0 -0
  126. test/integration/connectors/utils/validation/destination.py +0 -77
  127. test/integration/connectors/utils/validation/equality.py +0 -76
  128. test/integration/connectors/utils/validation/source.py +0 -331
  129. test/integration/connectors/utils/validation/utils.py +0 -36
  130. test/integration/connectors/weaviate/__init__.py +0 -0
  131. test/integration/connectors/weaviate/conftest.py +0 -15
  132. test/integration/connectors/weaviate/test_cloud.py +0 -39
  133. test/integration/connectors/weaviate/test_local.py +0 -152
  134. test/integration/embedders/__init__.py +0 -0
  135. test/integration/embedders/conftest.py +0 -13
  136. test/integration/embedders/test_azure_openai.py +0 -57
  137. test/integration/embedders/test_bedrock.py +0 -103
  138. test/integration/embedders/test_huggingface.py +0 -24
  139. test/integration/embedders/test_mixedbread.py +0 -71
  140. test/integration/embedders/test_octoai.py +0 -75
  141. test/integration/embedders/test_openai.py +0 -74
  142. test/integration/embedders/test_togetherai.py +0 -71
  143. test/integration/embedders/test_vertexai.py +0 -63
  144. test/integration/embedders/test_voyageai.py +0 -79
  145. test/integration/embedders/utils.py +0 -66
  146. test/integration/partitioners/__init__.py +0 -0
  147. test/integration/partitioners/test_partitioner.py +0 -76
  148. test/integration/utils.py +0 -15
  149. test/unit/__init__.py +0 -0
  150. test/unit/chunkers/__init__.py +0 -0
  151. test/unit/chunkers/test_chunkers.py +0 -49
  152. test/unit/connectors/__init__.py +0 -0
  153. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  154. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  155. test/unit/connectors/motherduck/__init__.py +0 -0
  156. test/unit/connectors/motherduck/test_base.py +0 -73
  157. test/unit/connectors/sql/__init__.py +0 -0
  158. test/unit/connectors/sql/test_sql.py +0 -152
  159. test/unit/connectors/test_confluence.py +0 -71
  160. test/unit/connectors/test_jira.py +0 -401
  161. test/unit/embed/__init__.py +0 -0
  162. test/unit/embed/test_mixedbreadai.py +0 -42
  163. test/unit/embed/test_octoai.py +0 -27
  164. test/unit/embed/test_openai.py +0 -28
  165. test/unit/embed/test_vertexai.py +0 -25
  166. test/unit/embed/test_voyageai.py +0 -24
  167. test/unit/embedders/__init__.py +0 -0
  168. test/unit/embedders/test_bedrock.py +0 -36
  169. test/unit/embedders/test_huggingface.py +0 -48
  170. test/unit/embedders/test_mixedbread.py +0 -37
  171. test/unit/embedders/test_octoai.py +0 -35
  172. test/unit/embedders/test_openai.py +0 -35
  173. test/unit/embedders/test_togetherai.py +0 -37
  174. test/unit/embedders/test_vertexai.py +0 -37
  175. test/unit/embedders/test_voyageai.py +0 -38
  176. test/unit/partitioners/__init__.py +0 -0
  177. test/unit/partitioners/test_partitioner.py +0 -63
  178. test/unit/test_error.py +0 -27
  179. test/unit/test_html.py +0 -112
  180. test/unit/test_interfaces.py +0 -26
  181. test/unit/test_utils.py +0 -220
  182. test/unit/utils/__init__.py +0 -0
  183. test/unit/utils/data_generator.py +0 -32
  184. unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
  185. unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
  186. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
  187. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
examples/singlestore.py DELETED
@@ -1,49 +0,0 @@
- from pathlib import Path
-
- from unstructured_ingest.interfaces import ProcessorConfig
- from unstructured_ingest.logger import logger
- from unstructured_ingest.pipeline.pipeline import Pipeline
- from unstructured_ingest.processes.chunker import ChunkerConfig
- from unstructured_ingest.processes.connectors.local import (
-     LocalConnectionConfig,
-     LocalDownloaderConfig,
-     LocalIndexerConfig,
- )
- from unstructured_ingest.processes.connectors.singlestore import (
-     CONNECTOR_TYPE,
-     SingleStoreAccessConfig,
-     SingleStoreConnectionConfig,
-     SingleStoreUploaderConfig,
-     SingleStoreUploadStagerConfig,
- )
- from unstructured_ingest.processes.embedder import EmbedderConfig
- from unstructured_ingest.processes.partitioner import PartitionerConfig
-
- base_path = Path(__file__).parent.parent.parent.parent
- docs_path = base_path / "example-docs"
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
- output_path = work_dir / "output"
- download_path = work_dir / "download"
-
- if __name__ == "__main__":
-     logger.info(f"writing all content in: {work_dir.resolve()}")
-     Pipeline.from_configs(
-         context=ProcessorConfig(work_dir=str(work_dir.resolve()), tqdm=True, verbose=True),
-         indexer_config=LocalIndexerConfig(
-             input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt"
-         ),
-         downloader_config=LocalDownloaderConfig(download_dir=download_path),
-         source_connection_config=LocalConnectionConfig(),
-         partitioner_config=PartitionerConfig(strategy="fast"),
-         chunker_config=ChunkerConfig(chunking_strategy="by_title"),
-         embedder_config=EmbedderConfig(embedding_provider="huggingface"),
-         destination_connection_config=SingleStoreConnectionConfig(
-             access_config=SingleStoreAccessConfig(password="password"),
-             host="localhost",
-             port=3306,
-             database="ingest_test",
-             user="root",
-         ),
-         stager_config=SingleStoreUploadStagerConfig(),
-         uploader_config=SingleStoreUploaderConfig(table_name="elements"),
-     ).run()
examples/sql.py DELETED
@@ -1,90 +0,0 @@
- import os
- import sqlite3
- from pathlib import Path
-
- from unstructured_ingest.interfaces import ProcessorConfig
- from unstructured_ingest.logger import logger
- from unstructured_ingest.pipeline.pipeline import Pipeline
- from unstructured_ingest.processes.chunker import ChunkerConfig
- from unstructured_ingest.processes.connectors.local import (
-     LocalConnectionConfig,
-     LocalDownloaderConfig,
-     LocalIndexerConfig,
- )
- from unstructured_ingest.processes.connectors.sql import (
-     CONNECTOR_TYPE,
-     POSTGRESQL_DB,
-     SQLITE_DB,
-     SQLAccessConfig,
-     SQLConnectionConfig,
-     SQLUploaderConfig,
-     SQLUploadStagerConfig,
- )
- from unstructured_ingest.processes.embedder import EmbedderConfig
- from unstructured_ingest.processes.partitioner import PartitionerConfig
-
- base_path = Path(__file__).parent.parent.parent.parent
- docs_path = base_path / "example-docs"
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
- output_path = work_dir / "output"
- download_path = work_dir / "download"
-
- SQLITE_DB_PATH = "test-sql-db.sqlite"
-
- if __name__ == "__main__":
-     logger.info(f"writing all content in: {work_dir.resolve()}")
-
-     configs = {
-         "context": ProcessorConfig(work_dir=str(work_dir.resolve())),
-         "indexer_config": LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"),
-         "downloader_config": LocalDownloaderConfig(download_dir=download_path),
-         "source_connection_config": LocalConnectionConfig(),
-         "partitioner_config": PartitionerConfig(strategy="fast"),
-         "chunker_config": ChunkerConfig(
-             chunking_strategy="by_title",
-             chunk_include_orig_elements=False,
-             chunk_max_characters=1500,
-             chunk_multipage_sections=True,
-         ),
-         "embedder_config": EmbedderConfig(embedding_provider="huggingface"),
-         "stager_config": SQLUploadStagerConfig(),
-         "uploader_config": SQLUploaderConfig(batch_size=10),
-     }
-
-     if os.path.exists(SQLITE_DB_PATH):
-         os.remove(SQLITE_DB_PATH)
-
-     connection = sqlite3.connect(database=SQLITE_DB_PATH)
-
-     query = None
-     script_path = (
-         Path(__file__).parent.parent.parent.parent.parent
-         / Path("test_e2e/env_setup/sql/sqlite-schema.sql")
-     ).resolve()
-     with open(script_path) as f:
-         query = f.read()
-     cursor = connection.cursor()
-     cursor.executescript(query)
-     connection.close()
-
-     # sqlite test first
-     Pipeline.from_configs(
-         destination_connection_config=SQLConnectionConfig(
-             db_type=SQLITE_DB,
-             database=SQLITE_DB_PATH,
-             access_config=SQLAccessConfig(),
-         ),
-         **configs,
-     ).run()
-
-     # now, pg with pgvector
-     Pipeline.from_configs(
-         destination_connection_config=SQLConnectionConfig(
-             db_type=POSTGRESQL_DB,
-             database="elements",
-             host="localhost",
-             port=5433,
-             access_config=SQLAccessConfig(username="unstructured", password="test"),
-         ),
-         **configs,
-     ).run()
examples/vectara.py DELETED
@@ -1,54 +0,0 @@
- from pathlib import Path
-
- from unstructured_ingest.interfaces import ProcessorConfig
- from unstructured_ingest.logger import logger
- from unstructured_ingest.pipeline.pipeline import Pipeline
- from unstructured_ingest.processes.chunker import ChunkerConfig
- from unstructured_ingest.processes.connectors.local import (
-     LocalConnectionConfig,
-     LocalDownloaderConfig,
-     LocalIndexerConfig,
- )
- from unstructured_ingest.processes.connectors.vectara import (
-     CONNECTOR_TYPE,
-     VectaraAccessConfig,
-     VectaraConnectionConfig,
-     VectaraUploaderConfig,
-     VectaraUploadStagerConfig,
- )
- from unstructured_ingest.processes.embedder import EmbedderConfig
- from unstructured_ingest.processes.partitioner import PartitionerConfig
-
- base_path = Path(__file__).parent.parent.parent.parent
- docs_path = base_path / "example-docs"
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
- output_path = work_dir / "output"
- download_path = work_dir / "download"
-
- if __name__ == "__main__":
-     logger.info(f"writing all content in: {work_dir.resolve()}")
-     Pipeline.from_configs(
-         context=ProcessorConfig(work_dir=str(work_dir.resolve())),
-         indexer_config=LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"),
-         downloader_config=LocalDownloaderConfig(download_dir=download_path),
-         source_connection_config=LocalConnectionConfig(),
-         partitioner_config=PartitionerConfig(strategy="fast"),
-         chunker_config=ChunkerConfig(
-             chunking_strategy="by_title",
-             chunk_include_orig_elements=False,
-             chunk_max_characters=1500,
-             chunk_multipage_sections=True,
-         ),
-         embedder_config=EmbedderConfig(embedding_provider="huggingface"),
-         destination_connection_config=VectaraConnectionConfig(
-             access_config=VectaraAccessConfig(
-                 oauth_client_id="fill oauth_client_id", oauth_secret="fill oauth_secret"
-             ),
-             customer_id="fill customer_id",
-             corpus_name="fill corpus_name",
-             corpus_key="fill corpus_key",
-             token_url="fill token_url",
-         ),
-         stager_config=VectaraUploadStagerConfig(batch_size=10),
-         uploader_config=VectaraUploaderConfig(),
-     ).run()
examples/weaviate.py DELETED
@@ -1,44 +0,0 @@
- from pathlib import Path
-
- from unstructured_ingest.interfaces import ProcessorConfig
- from unstructured_ingest.logger import logger
- from unstructured_ingest.pipeline.pipeline import Pipeline
- from unstructured_ingest.processes.chunker import ChunkerConfig
- from unstructured_ingest.processes.connectors.local import (
-     LocalConnectionConfig,
-     LocalDownloaderConfig,
-     LocalIndexerConfig,
- )
- from unstructured_ingest.processes.connectors.weaviate.local import (
-     CONNECTOR_TYPE,
-     LocalWeaviateConnectionConfig,
-     LocalWeaviateUploaderConfig,
-     LocalWeaviateUploadStagerConfig,
- )
- from unstructured_ingest.processes.embedder import EmbedderConfig
- from unstructured_ingest.processes.partitioner import PartitionerConfig
-
- base_path = Path(__file__).parent.parent.parent.parent
- docs_path = base_path / "example-docs"
- work_dir = base_path / "tmp_ingest" / CONNECTOR_TYPE
- output_path = work_dir / "output"
- download_path = work_dir / "download"
-
- if __name__ == "__main__":
-     logger.info(f"writing all content in: {work_dir.resolve()}")
-     Pipeline.from_configs(
-         context=ProcessorConfig(work_dir=str(work_dir.resolve())),
-         indexer_config=LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"),
-         downloader_config=LocalDownloaderConfig(download_dir=download_path),
-         source_connection_config=LocalConnectionConfig(),
-         partitioner_config=PartitionerConfig(strategy="fast"),
-         chunker_config=ChunkerConfig(chunking_strategy="by_title"),
-         embedder_config=EmbedderConfig(embedding_provider="huggingface"),
-         destination_connection_config=LocalWeaviateConnectionConfig(
-             # Connects to http://localhost:8080
-         ),
-         stager_config=LocalWeaviateUploadStagerConfig(),
-         uploader_config=LocalWeaviateUploaderConfig(
-             collection="elements", batch_size=10, dynamic_batch=False
-         ),
-     ).run()
test/__init__.py DELETED
File without changes
test/integration/__init__.py DELETED
File without changes
test/integration/chunkers/__init__.py DELETED
File without changes
test/integration/chunkers/test_chunkers.py DELETED
@@ -1,31 +0,0 @@
- import os
- from pathlib import Path
-
- import pytest
-
- from test.integration.utils import requires_env
- from unstructured_ingest.processes.chunker import Chunker, ChunkerConfig
-
- int_test_dir = Path(__file__).parent
- assets_dir = int_test_dir / "assets"
-
- chunker_files = [path for path in assets_dir.iterdir() if path.is_file()]
-
-
- @pytest.mark.parametrize("chunker_file", chunker_files, ids=[path.name for path in chunker_files])
- @pytest.mark.parametrize("strategy", ["basic", "by_title", "by_similarity", "by_page"])
- @requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
- @pytest.mark.asyncio
- async def test_chunker_api(chunker_file: Path, strategy: str):
-     api_key = os.getenv("UNSTRUCTURED_API_KEY")
-     api_url = os.getenv("UNSTRUCTURED_API_URL")
-
-     chunker_config = ChunkerConfig(
-         chunking_strategy=strategy,
-         chunk_by_api=True,
-         chunk_api_key=api_key,
-         chunking_endpoint=api_url,
-     )
-     chunker = Chunker(config=chunker_config)
-     results = await chunker.run_async(elements_filepath=chunker_file)
-     assert results
test/integration/connectors/__init__.py DELETED
File without changes
test/integration/connectors/conftest.py DELETED
@@ -1,38 +0,0 @@
- import tempfile
- from pathlib import Path
- from typing import Generator
-
- import pytest
-
- from unstructured_ingest.logger import logger
-
- FILENAME = Path("DA-1p-with-duplicate-pages.pdf.json")
-
-
- @pytest.fixture
- def upload_file() -> Path:
-     int_test_dir = Path(__file__).parent
-     assets_dir = int_test_dir / "assets"
-     upload_file = assets_dir / FILENAME
-     assert upload_file.exists()
-     assert upload_file.is_file()
-     return upload_file
-
-
- @pytest.fixture
- def upload_file_ndjson() -> Path:
-     int_test_dir = Path(__file__).parent
-     assets_dir = int_test_dir / "assets"
-     upload_file = assets_dir / FILENAME.with_suffix(".ndjson")
-     assert upload_file.exists()
-     assert upload_file.is_file()
-     return upload_file
-
-
- @pytest.fixture
- def temp_dir() -> Generator[Path, None, None]:
-     with tempfile.TemporaryDirectory() as temp_dir:
-         temp_path = Path(temp_dir)
-         logger.info(f"Created temp dir '{temp_path}'")
-         yield temp_path
-         logger.info(f"Removing temp dir '{temp_path}'")
test/integration/connectors/databricks/__init__.py DELETED
File without changes
test/integration/connectors/databricks/test_volumes_native.py DELETED
@@ -1,273 +0,0 @@
- import json
- import os
- import uuid
- from contextlib import contextmanager
- from dataclasses import dataclass
- from pathlib import Path
- from unittest import mock
-
- import pytest
- from databricks.sdk import WorkspaceClient
- from databricks.sdk.errors.platform import NotFound
-
- from test.integration.connectors.utils.constants import (
-     BLOB_STORAGE_TAG,
-     DESTINATION_TAG,
-     SOURCE_TAG,
- )
- from test.integration.connectors.utils.validation.source import (
-     SourceValidationConfigs,
-     source_connector_validation,
- )
- from test.integration.utils import requires_env
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
- from unstructured_ingest.errors_v2 import UserAuthError, UserError
- from unstructured_ingest.processes.connectors.databricks.volumes_native import (
-     CONNECTOR_TYPE,
-     DatabricksNativeVolumesAccessConfig,
-     DatabricksNativeVolumesConnectionConfig,
-     DatabricksNativeVolumesDownloader,
-     DatabricksNativeVolumesDownloaderConfig,
-     DatabricksNativeVolumesIndexer,
-     DatabricksNativeVolumesIndexerConfig,
-     DatabricksNativeVolumesUploader,
-     DatabricksNativeVolumesUploaderConfig,
- )
-
-
- @dataclass
- class BaseEnvData:
-     host: str
-     catalog: str
-
-
- @dataclass
- class BasicAuthEnvData(BaseEnvData):
-     client_id: str
-     client_secret: str
-
-     def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
-         return DatabricksNativeVolumesConnectionConfig(
-             host=self.host,
-             access_config=DatabricksNativeVolumesAccessConfig(
-                 client_id=self.client_id,
-                 client_secret=self.client_secret,
-             ),
-         )
-
-
- @dataclass
- class PATEnvData(BaseEnvData):
-     token: str
-
-     def get_connection_config(self) -> DatabricksNativeVolumesConnectionConfig:
-         return DatabricksNativeVolumesConnectionConfig(
-             host=self.host,
-             access_config=DatabricksNativeVolumesAccessConfig(
-                 token=self.token,
-             ),
-         )
-
-
- def get_basic_auth_env_data() -> BasicAuthEnvData:
-     return BasicAuthEnvData(
-         host=os.environ["DATABRICKS_HOST"],
-         client_id=os.environ["DATABRICKS_CLIENT_ID"],
-         client_secret=os.environ["DATABRICKS_CLIENT_SECRET"],
-         catalog=os.environ["DATABRICKS_CATALOG"],
-     )
-
-
- def get_pat_env_data() -> PATEnvData:
-     return PATEnvData(
-         host=os.environ["DATABRICKS_HOST"],
-         catalog=os.environ["DATABRICKS_CATALOG"],
-         token=os.environ["DATABRICKS_PAT"],
-     )
-
-
- @pytest.mark.asyncio
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
- @requires_env(
-     "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
- )
- async def test_volumes_native_source(tmp_path: Path):
-     env_data = get_basic_auth_env_data()
-     with mock.patch.dict(os.environ, clear=True):
-         indexer_config = DatabricksNativeVolumesIndexerConfig(
-             recursive=True,
-             volume="test-platform",
-             volume_path="databricks-volumes-test-input",
-             catalog=env_data.catalog,
-         )
-         connection_config = env_data.get_connection_config()
-         download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tmp_path)
-         indexer = DatabricksNativeVolumesIndexer(
-             connection_config=connection_config, index_config=indexer_config
-         )
-         downloader = DatabricksNativeVolumesDownloader(
-             connection_config=connection_config, download_config=download_config
-         )
-         await source_connector_validation(
-             indexer=indexer,
-             downloader=downloader,
-             configs=SourceValidationConfigs(
-                 test_id="databricks_volumes_native",
-                 expected_num_files=1,
-             ),
-         )
-
-
- @pytest.mark.asyncio
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
- @requires_env("DATABRICKS_HOST", "DATABRICKS_PAT", "DATABRICKS_CATALOG")
- async def test_volumes_native_source_pat(tmp_path: Path):
-     env_data = get_pat_env_data()
-     with mock.patch.dict(os.environ, clear=True):
-         indexer_config = DatabricksNativeVolumesIndexerConfig(
-             recursive=True,
-             volume="test-platform",
-             volume_path="databricks-volumes-test-input",
-             catalog=env_data.catalog,
-         )
-         connection_config = env_data.get_connection_config()
-         download_config = DatabricksNativeVolumesDownloaderConfig(download_dir=tmp_path)
-         indexer = DatabricksNativeVolumesIndexer(
-             connection_config=connection_config, index_config=indexer_config
-         )
-         downloader = DatabricksNativeVolumesDownloader(
-             connection_config=connection_config, download_config=download_config
-         )
-         await source_connector_validation(
-             indexer=indexer,
-             downloader=downloader,
-             configs=SourceValidationConfigs(
-                 test_id="databricks_volumes_native_pat",
-                 expected_num_files=1,
-             ),
-         )
-
-
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
- @requires_env("DATABRICKS_HOST", "DATABRICKS_PAT", "DATABRICKS_CATALOG")
- def test_volumes_native_source_pat_invalid_catalog():
-     env_data = get_pat_env_data()
-     with mock.patch.dict(os.environ, clear=True):
-         indexer_config = DatabricksNativeVolumesIndexerConfig(
-             recursive=True,
-             volume="test-platform",
-             volume_path="databricks-volumes-test-input",
-             catalog="fake_catalog",
-         )
-         indexer = DatabricksNativeVolumesIndexer(
-             connection_config=env_data.get_connection_config(), index_config=indexer_config
-         )
-         with pytest.raises(UserError):
-             _ = list(indexer.run())
-
-
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, BLOB_STORAGE_TAG)
- @requires_env("DATABRICKS_HOST")
- def test_volumes_native_source_pat_invalid_pat():
-     host = os.environ["DATABRICKS_HOST"]
-     with mock.patch.dict(os.environ, clear=True):
-         indexer_config = DatabricksNativeVolumesIndexerConfig(
-             recursive=True,
-             volume="test-platform",
-             volume_path="databricks-volumes-test-input",
-             catalog="fake_catalog",
-         )
-         connection_config = DatabricksNativeVolumesConnectionConfig(
-             host=host,
-             access_config=DatabricksNativeVolumesAccessConfig(
-                 token="invalid-token",
-             ),
-         )
-         indexer = DatabricksNativeVolumesIndexer(
-             connection_config=connection_config, index_config=indexer_config
-         )
-         with pytest.raises(UserAuthError):
-             _ = list(indexer.run())
-
-
- def _get_volume_path(catalog: str, volume: str, volume_path: str):
-     return f"/Volumes/{catalog}/default/{volume}/{volume_path}"
-
-
- @contextmanager
- def databricks_destination_context(
-     env_data: BasicAuthEnvData, volume: str, volume_path
- ) -> WorkspaceClient:
-     client = WorkspaceClient(
-         host=env_data.host, client_id=env_data.client_id, client_secret=env_data.client_secret
-     )
-     try:
-         yield client
-     finally:
-         # Cleanup
-         try:
-             for file in client.files.list_directory_contents(
-                 directory_path=_get_volume_path(env_data.catalog, volume, volume_path)
-             ):
-                 client.files.delete(file.path)
-             client.files.delete_directory(_get_volume_path(env_data.catalog, volume, volume_path))
-         except NotFound:
-             # Directory was never created, don't need to delete
-             pass
-
-
- def validate_upload(client: WorkspaceClient, catalog: str, volume: str, volume_path: str):
-     files = list(
-         client.files.list_directory_contents(
-             directory_path=_get_volume_path(catalog, volume, volume_path)
-         )
-     )
-
-     assert len(files) == 1
-
-     resp = client.files.download(files[0].path)
-     data = json.loads(resp.contents.read())
-
-     assert len(data) == 22
-     element_types = {v["type"] for v in data}
-     assert len(element_types) == 1
-     assert "CompositeElement" in element_types
-
-
- @pytest.mark.asyncio
- @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, BLOB_STORAGE_TAG)
- @requires_env(
-     "DATABRICKS_HOST", "DATABRICKS_CLIENT_ID", "DATABRICKS_CLIENT_SECRET", "DATABRICKS_CATALOG"
- )
- async def test_volumes_native_destination(upload_file: Path):
-     env_data = get_basic_auth_env_data()
-     volume_path = f"databricks-volumes-test-output-{uuid.uuid4()}"
-     file_data = FileData(
-         source_identifiers=SourceIdentifiers(fullpath=upload_file.name, filename=upload_file.name),
-         connector_type=CONNECTOR_TYPE,
-         identifier="mock file data",
-     )
-     with databricks_destination_context(
-         volume="test-platform", volume_path=volume_path, env_data=env_data
-     ) as workspace_client:
-         connection_config = env_data.get_connection_config()
-         uploader = DatabricksNativeVolumesUploader(
-             connection_config=connection_config,
-             upload_config=DatabricksNativeVolumesUploaderConfig(
-                 volume="test-platform",
-                 volume_path=volume_path,
-                 catalog=env_data.catalog,
-             ),
-         )
-         uploader.precheck()
-         if uploader.is_async():
-             await uploader.run_async(path=upload_file, file_data=file_data)
-         else:
-             uploader.run(path=upload_file, file_data=file_data)
-
-         validate_upload(
-             client=workspace_client,
-             catalog=env_data.catalog,
-             volume="test-platform",
-             volume_path=volume_path,
-         )
test/integration/connectors/discord/__init__.py DELETED
File without changes
test/integration/connectors/discord/test_discord.py DELETED
@@ -1,90 +0,0 @@
- import os
- import tempfile
- from dataclasses import dataclass
- from pathlib import Path
- from typing import Optional
-
- import pytest
-
- from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
- from test.integration.connectors.utils.validation.source import (
-     SourceValidationConfigs,
-     source_connector_validation,
- )
- from test.integration.utils import requires_env
- from unstructured_ingest.error import SourceConnectionError
- from unstructured_ingest.processes.connectors.discord import (
-     CONNECTOR_TYPE,
-     DiscordAccessConfig,
-     DiscordConnectionConfig,
-     DiscordDownloader,
-     DiscordDownloaderConfig,
-     DiscordIndexer,
-     DiscordIndexerConfig,
- )
-
-
- @dataclass(frozen=True)
- class EnvData:
-     token: Optional[str]
-     channels: Optional[list[str]]
-
-
- def get_env_data() -> EnvData:
-     return EnvData(
-         token=os.getenv("DISCORD_TOKEN"),
-         channels=os.getenv("DISCORD_CHANNELS", default="").split(","),
-     )
-
-
- @pytest.mark.asyncio
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
- @requires_env("DISCORD_TOKEN", "DISCORD_CHANNELS")
- async def test_discord_source():
-     env = get_env_data()
-     indexer_config = DiscordIndexerConfig(channels=env.channels)
-     with tempfile.TemporaryDirectory() as tempdir:
-         tempdir_path = Path(tempdir)
-         connection_config = DiscordConnectionConfig(
-             access_config=DiscordAccessConfig(token=env.token)
-         )
-         download_config = DiscordDownloaderConfig(download_dir=tempdir_path)
-         indexer = DiscordIndexer(connection_config=connection_config, index_config=indexer_config)
-         downloader = DiscordDownloader(
-             connection_config=connection_config, download_config=download_config
-         )
-         expected_num_files = len(env.channels)
-         await source_connector_validation(
-             indexer=indexer,
-             downloader=downloader,
-             configs=SourceValidationConfigs(
-                 test_id=CONNECTOR_TYPE,
-                 expected_num_files=expected_num_files,
-                 expected_number_indexed_file_data=expected_num_files,
-                 validate_downloaded_files=True,
-             ),
-         )
-
-
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
- @requires_env("DISCORD_CHANNELS")
- def test_discord_source_precheck_fail_no_token():
-     indexer_config = DiscordIndexerConfig(channels=get_env_data().channels)
-
-     connection_config = DiscordConnectionConfig(access_config=DiscordAccessConfig(token=""))
-     indexer = DiscordIndexer(connection_config=connection_config, index_config=indexer_config)
-     with pytest.raises(SourceConnectionError):
-         indexer.precheck()
-
-
- @pytest.mark.tags(CONNECTOR_TYPE, SOURCE_TAG, UNCATEGORIZED_TAG)
- @requires_env("DISCORD_TOKEN")
- def test_discord_source_precheck_fail_no_channels():
-     indexer_config = DiscordIndexerConfig(channels=[])
-
-     connection_config = DiscordConnectionConfig(
-         access_config=DiscordAccessConfig(token=get_env_data().token)
-     )
-     indexer = DiscordIndexer(connection_config=connection_config, index_config=indexer_config)
-     with pytest.raises(SourceConnectionError):
-         indexer.precheck()
test/integration/connectors/duckdb/__init__.py DELETED
File without changes