unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (187)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/main.py +0 -0
  7. unstructured_ingest/pipeline/interfaces.py +1 -1
  8. unstructured_ingest/pipeline/pipeline.py +1 -1
  9. unstructured_ingest/processes/chunker.py +4 -0
  10. unstructured_ingest/processes/connectors/airtable.py +4 -2
  11. unstructured_ingest/processes/connectors/astradb.py +48 -34
  12. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  13. unstructured_ingest/processes/connectors/confluence.py +0 -1
  14. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  15. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  16. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  18. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  19. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  20. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  21. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  22. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  23. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  24. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  25. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  26. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  27. unstructured_ingest/processes/connectors/outlook.py +1 -2
  28. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  29. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  30. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  31. unstructured_ingest/processes/connectors/slack.py +1 -2
  32. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  33. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  34. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  35. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  37. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  38. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  39. unstructured_ingest/processes/connectors/vectara.py +0 -2
  40. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  41. unstructured_ingest/processes/embedder.py +2 -2
  42. unstructured_ingest/processes/filter.py +1 -1
  43. unstructured_ingest/processes/partitioner.py +4 -0
  44. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  45. unstructured_ingest/unstructured_api.py +13 -8
  46. unstructured_ingest/utils/data_prep.py +8 -32
  47. unstructured_ingest-1.0.2.dist-info/METADATA +226 -0
  48. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/RECORD +50 -184
  49. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/WHEEL +1 -2
  50. examples/__init__.py +0 -0
  51. examples/airtable.py +0 -44
  52. examples/azure_cognitive_search.py +0 -55
  53. examples/chroma.py +0 -54
  54. examples/couchbase.py +0 -55
  55. examples/databricks_volumes_dest.py +0 -55
  56. examples/databricks_volumes_source.py +0 -53
  57. examples/delta_table.py +0 -45
  58. examples/discord_example.py +0 -36
  59. examples/elasticsearch.py +0 -49
  60. examples/google_drive.py +0 -45
  61. examples/kdbai.py +0 -54
  62. examples/local.py +0 -36
  63. examples/milvus.py +0 -44
  64. examples/mongodb.py +0 -53
  65. examples/opensearch.py +0 -50
  66. examples/pinecone.py +0 -57
  67. examples/s3.py +0 -38
  68. examples/salesforce.py +0 -44
  69. examples/sharepoint.py +0 -47
  70. examples/singlestore.py +0 -49
  71. examples/sql.py +0 -90
  72. examples/vectara.py +0 -54
  73. examples/weaviate.py +0 -44
  74. test/__init__.py +0 -0
  75. test/integration/__init__.py +0 -0
  76. test/integration/chunkers/__init__.py +0 -0
  77. test/integration/chunkers/test_chunkers.py +0 -31
  78. test/integration/connectors/__init__.py +0 -0
  79. test/integration/connectors/conftest.py +0 -38
  80. test/integration/connectors/databricks/__init__.py +0 -0
  81. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  82. test/integration/connectors/discord/__init__.py +0 -0
  83. test/integration/connectors/discord/test_discord.py +0 -90
  84. test/integration/connectors/duckdb/__init__.py +0 -0
  85. test/integration/connectors/duckdb/conftest.py +0 -14
  86. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  87. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  88. test/integration/connectors/elasticsearch/__init__.py +0 -0
  89. test/integration/connectors/elasticsearch/conftest.py +0 -34
  90. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  91. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  92. test/integration/connectors/sql/__init__.py +0 -0
  93. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  94. test/integration/connectors/sql/test_postgres.py +0 -201
  95. test/integration/connectors/sql/test_singlestore.py +0 -182
  96. test/integration/connectors/sql/test_snowflake.py +0 -244
  97. test/integration/connectors/sql/test_sqlite.py +0 -168
  98. test/integration/connectors/sql/test_vastdb.py +0 -34
  99. test/integration/connectors/test_astradb.py +0 -287
  100. test/integration/connectors/test_azure_ai_search.py +0 -254
  101. test/integration/connectors/test_chroma.py +0 -136
  102. test/integration/connectors/test_confluence.py +0 -111
  103. test/integration/connectors/test_delta_table.py +0 -183
  104. test/integration/connectors/test_dropbox.py +0 -151
  105. test/integration/connectors/test_github.py +0 -49
  106. test/integration/connectors/test_google_drive.py +0 -257
  107. test/integration/connectors/test_jira.py +0 -67
  108. test/integration/connectors/test_lancedb.py +0 -247
  109. test/integration/connectors/test_milvus.py +0 -208
  110. test/integration/connectors/test_mongodb.py +0 -335
  111. test/integration/connectors/test_neo4j.py +0 -244
  112. test/integration/connectors/test_notion.py +0 -152
  113. test/integration/connectors/test_onedrive.py +0 -163
  114. test/integration/connectors/test_pinecone.py +0 -387
  115. test/integration/connectors/test_qdrant.py +0 -216
  116. test/integration/connectors/test_redis.py +0 -143
  117. test/integration/connectors/test_s3.py +0 -184
  118. test/integration/connectors/test_sharepoint.py +0 -222
  119. test/integration/connectors/test_vectara.py +0 -282
  120. test/integration/connectors/test_zendesk.py +0 -120
  121. test/integration/connectors/utils/__init__.py +0 -0
  122. test/integration/connectors/utils/constants.py +0 -13
  123. test/integration/connectors/utils/docker.py +0 -151
  124. test/integration/connectors/utils/docker_compose.py +0 -59
  125. test/integration/connectors/utils/validation/__init__.py +0 -0
  126. test/integration/connectors/utils/validation/destination.py +0 -77
  127. test/integration/connectors/utils/validation/equality.py +0 -76
  128. test/integration/connectors/utils/validation/source.py +0 -331
  129. test/integration/connectors/utils/validation/utils.py +0 -36
  130. test/integration/connectors/weaviate/__init__.py +0 -0
  131. test/integration/connectors/weaviate/conftest.py +0 -15
  132. test/integration/connectors/weaviate/test_cloud.py +0 -39
  133. test/integration/connectors/weaviate/test_local.py +0 -152
  134. test/integration/embedders/__init__.py +0 -0
  135. test/integration/embedders/conftest.py +0 -13
  136. test/integration/embedders/test_azure_openai.py +0 -57
  137. test/integration/embedders/test_bedrock.py +0 -103
  138. test/integration/embedders/test_huggingface.py +0 -24
  139. test/integration/embedders/test_mixedbread.py +0 -71
  140. test/integration/embedders/test_octoai.py +0 -75
  141. test/integration/embedders/test_openai.py +0 -74
  142. test/integration/embedders/test_togetherai.py +0 -71
  143. test/integration/embedders/test_vertexai.py +0 -63
  144. test/integration/embedders/test_voyageai.py +0 -79
  145. test/integration/embedders/utils.py +0 -66
  146. test/integration/partitioners/__init__.py +0 -0
  147. test/integration/partitioners/test_partitioner.py +0 -76
  148. test/integration/utils.py +0 -15
  149. test/unit/__init__.py +0 -0
  150. test/unit/chunkers/__init__.py +0 -0
  151. test/unit/chunkers/test_chunkers.py +0 -49
  152. test/unit/connectors/__init__.py +0 -0
  153. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  154. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  155. test/unit/connectors/motherduck/__init__.py +0 -0
  156. test/unit/connectors/motherduck/test_base.py +0 -73
  157. test/unit/connectors/sql/__init__.py +0 -0
  158. test/unit/connectors/sql/test_sql.py +0 -152
  159. test/unit/connectors/test_confluence.py +0 -71
  160. test/unit/connectors/test_jira.py +0 -401
  161. test/unit/embed/__init__.py +0 -0
  162. test/unit/embed/test_mixedbreadai.py +0 -42
  163. test/unit/embed/test_octoai.py +0 -27
  164. test/unit/embed/test_openai.py +0 -28
  165. test/unit/embed/test_vertexai.py +0 -25
  166. test/unit/embed/test_voyageai.py +0 -24
  167. test/unit/embedders/__init__.py +0 -0
  168. test/unit/embedders/test_bedrock.py +0 -36
  169. test/unit/embedders/test_huggingface.py +0 -48
  170. test/unit/embedders/test_mixedbread.py +0 -37
  171. test/unit/embedders/test_octoai.py +0 -35
  172. test/unit/embedders/test_openai.py +0 -35
  173. test/unit/embedders/test_togetherai.py +0 -37
  174. test/unit/embedders/test_vertexai.py +0 -37
  175. test/unit/embedders/test_voyageai.py +0 -38
  176. test/unit/partitioners/__init__.py +0 -0
  177. test/unit/partitioners/test_partitioner.py +0 -63
  178. test/unit/test_error.py +0 -27
  179. test/unit/test_html.py +0 -112
  180. test/unit/test_interfaces.py +0 -26
  181. test/unit/test_utils.py +0 -220
  182. test/unit/utils/__init__.py +0 -0
  183. test/unit/utils/data_generator.py +0 -32
  184. unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
  185. unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
  186. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/entry_points.txt +0 -0
  187. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info/licenses}/LICENSE.md +0 -0
test/unit/embedders/test_huggingface.py DELETED
@@ -1,48 +0,0 @@
1
- import random
2
- from typing import Any
3
-
4
- import faker
5
- import pytest
6
-
7
- from test.unit.utils.data_generator import generate_random_dictionary
8
- from unstructured_ingest.embed.huggingface import (
9
- HuggingFaceEmbeddingConfig,
10
- HuggingFaceEmbeddingEncoder,
11
- )
12
-
13
- fake = faker.Faker()
14
-
15
-
16
- def generate_embedder_config_params() -> dict:
17
- params = {}
18
- if random.random() < 0.5:
19
- params["embedder_model_name"] = fake.word() if random.random() < 0.5 else None
20
- params["embedder_model_kwargs"] = (
21
- generate_random_dictionary(key_type=str, value_type=Any)
22
- if random.random() < 0.5
23
- else None
24
- )
25
- params["encode_kwargs"] = (
26
- generate_random_dictionary(key_type=str, value_type=Any)
27
- if random.random() < 0.5
28
- else None
29
- )
30
- params["cache_folder"] = fake.file_path() if random.random() < 0.5 else None
31
- return params
32
-
33
-
34
- @pytest.mark.parametrize(
35
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
36
- )
37
- def test_embedder_config(embedder_config_params: dict):
38
- embedder_config = HuggingFaceEmbeddingConfig.model_validate(embedder_config_params)
39
- assert embedder_config
40
-
41
-
42
- @pytest.mark.parametrize(
43
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
44
- )
45
- def test_embedder(embedder_config_params: dict):
46
- embedder_config = HuggingFaceEmbeddingConfig.model_validate(embedder_config_params)
47
- embedder = HuggingFaceEmbeddingEncoder(config=embedder_config)
48
- assert embedder
test/unit/embedders/test_mixedbread.py DELETED
@@ -1,37 +0,0 @@
1
- import random
2
-
3
- import faker
4
- import pytest
5
-
6
- from unstructured_ingest.embed.mixedbreadai import (
7
- MixedbreadAIEmbeddingConfig,
8
- MixedbreadAIEmbeddingEncoder,
9
- )
10
-
11
- fake = faker.Faker()
12
-
13
-
14
- def generate_embedder_config_params() -> dict:
15
- params = {
16
- "api_key": fake.password(),
17
- }
18
- if random.random() < 0.5:
19
- params["embedder_model_name"] = fake.word()
20
- return params
21
-
22
-
23
- @pytest.mark.parametrize(
24
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
25
- )
26
- def test_embedder_config(embedder_config_params: dict):
27
- embedder_config = MixedbreadAIEmbeddingConfig.model_validate(embedder_config_params)
28
- assert embedder_config
29
-
30
-
31
- @pytest.mark.parametrize(
32
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
33
- )
34
- def test_embedder(embedder_config_params: dict):
35
- embedder_config = MixedbreadAIEmbeddingConfig.model_validate(embedder_config_params)
36
- embedder = MixedbreadAIEmbeddingEncoder(config=embedder_config)
37
- assert embedder
test/unit/embedders/test_octoai.py DELETED
@@ -1,35 +0,0 @@
1
- import random
2
-
3
- import faker
4
- import pytest
5
-
6
- from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
7
-
8
- fake = faker.Faker()
9
-
10
-
11
- def generate_embedder_config_params() -> dict:
12
- params = {
13
- "api_key": fake.password(),
14
- }
15
- if random.random() < 0.5:
16
- params["embedder_model_name"] = fake.word()
17
- params["base_url"] = fake.url()
18
- return params
19
-
20
-
21
- @pytest.mark.parametrize(
22
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
23
- )
24
- def test_embedder_config(embedder_config_params: dict):
25
- embedder_config = OctoAiEmbeddingConfig.model_validate(embedder_config_params)
26
- assert embedder_config
27
-
28
-
29
- @pytest.mark.parametrize(
30
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
31
- )
32
- def test_embedder(embedder_config_params: dict):
33
- embedder_config = OctoAiEmbeddingConfig.model_validate(embedder_config_params)
34
- embedder = OctoAIEmbeddingEncoder(config=embedder_config)
35
- assert embedder
test/unit/embedders/test_openai.py DELETED
@@ -1,35 +0,0 @@
1
- import random
2
-
3
- import faker
4
- import pytest
5
-
6
- from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
7
-
8
- fake = faker.Faker()
9
-
10
-
11
- def generate_embedder_config_params() -> dict:
12
- params = {
13
- "api_key": fake.password(),
14
- }
15
- if random.random() < 0.5:
16
- params["embedder_model_name"] = fake.word()
17
- params["base_url"] = fake.url()
18
- return params
19
-
20
-
21
- @pytest.mark.parametrize(
22
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
23
- )
24
- def test_embedder_config(embedder_config_params: dict):
25
- embedder_config = OpenAIEmbeddingConfig.model_validate(embedder_config_params)
26
- assert embedder_config
27
-
28
-
29
- @pytest.mark.parametrize(
30
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
31
- )
32
- def test_embedder(embedder_config_params: dict):
33
- embedder_config = OpenAIEmbeddingConfig.model_validate(embedder_config_params)
34
- embedder = OpenAIEmbeddingEncoder(config=embedder_config)
35
- assert embedder
test/unit/embedders/test_togetherai.py DELETED
@@ -1,37 +0,0 @@
1
- import random
2
-
3
- import faker
4
- import pytest
5
-
6
- from unstructured_ingest.embed.togetherai import (
7
- TogetherAIEmbeddingConfig,
8
- TogetherAIEmbeddingEncoder,
9
- )
10
-
11
- fake = faker.Faker()
12
-
13
-
14
- def generate_embedder_config_params() -> dict:
15
- params = {
16
- "api_key": fake.password(),
17
- }
18
- if random.random() < 0.5:
19
- params["embedder_model_name"] = fake.word()
20
- return params
21
-
22
-
23
- @pytest.mark.parametrize(
24
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
25
- )
26
- def test_embedder_config(embedder_config_params: dict):
27
- embedder_config = TogetherAIEmbeddingConfig.model_validate(embedder_config_params)
28
- assert embedder_config
29
-
30
-
31
- @pytest.mark.parametrize(
32
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
33
- )
34
- def test_embedder(embedder_config_params: dict):
35
- embedder_config = TogetherAIEmbeddingConfig.model_validate(embedder_config_params)
36
- embedder = TogetherAIEmbeddingEncoder(config=embedder_config)
37
- assert embedder
test/unit/embedders/test_vertexai.py DELETED
@@ -1,37 +0,0 @@
1
- import json
2
- import random
3
- from typing import Any
4
-
5
- import faker
6
- import pytest
7
-
8
- from test.unit.utils.data_generator import generate_random_dictionary
9
- from unstructured_ingest.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder
10
-
11
- fake = faker.Faker()
12
-
13
-
14
- def generate_embedder_config_params() -> dict:
15
- params = {
16
- "api_key": json.dumps(generate_random_dictionary(key_type=str, value_type=Any)),
17
- }
18
- if random.random() < 0.5:
19
- params["embedder_model_name"] = fake.word()
20
- return params
21
-
22
-
23
- @pytest.mark.parametrize(
24
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
25
- )
26
- def test_embedder_config(embedder_config_params: dict):
27
- embedder_config = VertexAIEmbeddingConfig.model_validate(embedder_config_params)
28
- assert embedder_config
29
-
30
-
31
- @pytest.mark.parametrize(
32
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
33
- )
34
- def test_embedder(embedder_config_params: dict):
35
- embedder_config = VertexAIEmbeddingConfig.model_validate(embedder_config_params)
36
- embedder = VertexAIEmbeddingEncoder(config=embedder_config)
37
- assert embedder
test/unit/embedders/test_voyageai.py DELETED
@@ -1,38 +0,0 @@
1
- import random
2
-
3
- import faker
4
- import pytest
5
-
6
- from unstructured_ingest.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
7
-
8
- fake = faker.Faker()
9
-
10
-
11
- def generate_embedder_config_params() -> dict:
12
- params = {
13
- "api_key": fake.password(),
14
- }
15
- if random.random() < 0.5:
16
- params["embedder_model_name"] = fake.word()
17
- params["batch_size"] = fake.random_int(max=100)
18
- params["truncation"] = fake.boolean()
19
- params["max_retries"] = fake.random_int()
20
- params["timeout_in_seconds"] = fake.random_int()
21
- return params
22
-
23
-
24
- @pytest.mark.parametrize(
25
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
26
- )
27
- def test_embedder_config(embedder_config_params: dict):
28
- embedder_config = VoyageAIEmbeddingConfig.model_validate(embedder_config_params)
29
- assert embedder_config
30
-
31
-
32
- @pytest.mark.parametrize(
33
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
34
- )
35
- def test_embedder(embedder_config_params: dict):
36
- embedder_config = VoyageAIEmbeddingConfig.model_validate(embedder_config_params)
37
- embedder = VoyageAIEmbeddingEncoder(config=embedder_config)
38
- assert embedder
File without changes
test/unit/partitioners/test_partitioner.py DELETED
@@ -1,63 +0,0 @@
1
- import random
2
- from typing import Any
3
-
4
- import faker
5
- import pytest
6
-
7
- from test.unit.utils.data_generator import generate_random_dictionary
8
- from unstructured_ingest.processes.partitioner import Partitioner, PartitionerConfig
9
-
10
- fake = faker.Faker()
11
-
12
-
13
- def generate_partitioner_config_params() -> dict:
14
- params = {
15
- "strategy": random.choice(["fast", "hi_res", "auto"]),
16
- "ocr_languages": fake.words() if random.random() < 0.5 else None,
17
- "encoding": fake.word() if random.random() < 0.5 else None,
18
- "additional_partition_args": (
19
- generate_random_dictionary(key_type=str, value_type=Any)
20
- if random.random() < 0.5
21
- else None
22
- ),
23
- "skip_infer_table_types": fake.words() if random.random() < 0.5 else None,
24
- "flatten_metadata": fake.boolean(),
25
- "hi_res_model_name": fake.word() if random.random() < 0.5 else None,
26
- }
27
- random_val = random.random()
28
- # Randomly set the fields_include to a random list[str]
29
- if random_val < 0.5:
30
- params["fields_include"] = fake.words()
31
-
32
- # Randomly set the metadata_exclude or metadata_include to a valid
33
- # list[str] or don't set it at all
34
- if random.random() < (1 / 3):
35
- params["metadata_exclude"] = fake.words()
36
- elif random_val < (2 / 3):
37
- params["metadata_include"] = fake.words()
38
-
39
- # Randomly set the values associated with calling the api, or not at all
40
- if random.random() < 0.5:
41
- params["partition_by_api"]: True
42
- params["partition_endpoint"] = fake.url()
43
- params["api_key"] = fake.password()
44
- else:
45
- params["partition_by_api"]: False
46
- return params
47
-
48
-
49
- @pytest.mark.parametrize(
50
- "partition_config_params", [generate_partitioner_config_params() for i in range(10)]
51
- )
52
- def test_partition_config(partition_config_params: dict):
53
- partition_config = PartitionerConfig.model_validate(partition_config_params)
54
- assert partition_config
55
-
56
-
57
- @pytest.mark.parametrize(
58
- "partition_config_params", [generate_partitioner_config_params() for i in range(10)]
59
- )
60
- def test_partitioner(partition_config_params: dict):
61
- partition_config = PartitionerConfig.model_validate(partition_config_params)
62
- partitioner = Partitioner(config=partition_config)
63
- assert partitioner
test/unit/test_error.py DELETED
@@ -1,27 +0,0 @@
1
- import pytest
2
-
3
- from unstructured_ingest.error import (
4
- DestinationConnectionError,
5
- PartitionError,
6
- SourceConnectionError,
7
- )
8
-
9
-
10
- @pytest.mark.parametrize(
11
- ("error_class", "exception_type", "error_message"),
12
- [
13
- (SourceConnectionError, ValueError, "Simulated connection error"),
14
- (DestinationConnectionError, RuntimeError, "Simulated connection error"),
15
- (PartitionError, FileNotFoundError, "Simulated partition error"),
16
- ],
17
- )
18
- def test_custom_error_decorator(error_class, exception_type, error_message):
19
- @error_class.wrap
20
- def simulate_error():
21
- raise exception_type(error_message)
22
-
23
- with pytest.raises(error_class) as context:
24
- simulate_error()
25
-
26
- expected_error_string = error_class.error_string.format(error_message)
27
- assert str(context.value) == expected_error_string
test/unit/test_html.py DELETED
@@ -1,112 +0,0 @@
1
- import base64
2
- from pathlib import Path
3
-
4
- from bs4 import BeautifulSoup
5
- from pytest_mock import MockerFixture
6
-
7
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
8
- from unstructured_ingest.utils.html import HtmlMixin
9
-
10
-
11
- def test_extract_images(mocker: MockerFixture):
12
- mixin = HtmlMixin(extract_images=True)
13
- mock_download_response = b"DOWNLOADED"
14
- expected_image_src = base64.b64encode(mock_download_response).decode()
15
- mocked_download_response = mocker.patch(
16
- "unstructured_ingest.utils.html.HtmlMixin.download_content",
17
- return_value=mock_download_response,
18
- )
19
- url = "http://mywebsite.com/path/to/page"
20
- html = """
21
- <img src="http://mywebsite.com/img1.jpg"/>
22
- <img src="http://notmywebsite.com/img2.jpg"/>
23
- <img src="img3.jpg"/>
24
- <img src="..."/>
25
- """
26
- expected_html = f"""
27
- <img src="data:image/png;base64,{expected_image_src}"/>
28
- <img src="http://notmywebsite.com/img2.jpg"/>
29
- <img src="data:image/png;base64,{expected_image_src}"/>
30
- <img src="..."/>
31
- """
32
- expected_soup = BeautifulSoup(expected_html, "html.parser")
33
- result = mixin.extract_html_images(url=url, html=html)
34
- result_soup = BeautifulSoup(result, "html.parser")
35
- assert expected_soup == result_soup
36
- assert mocked_download_response.call_count == 2
37
- urls_to_download = [
38
- call_args_list.kwargs["url"] for call_args_list in mocked_download_response.call_args_list
39
- ]
40
- assert urls_to_download == ["http://mywebsite.com/img1.jpg", "http://mywebsite.com/img3.jpg"]
41
-
42
-
43
- def test_extract_images_allow_list(mocker: MockerFixture):
44
- mixin = HtmlMixin(
45
- extract_images=True, allow_list=["http://allowedwebsite1.com", "http://allowedwebsite2.com"]
46
- )
47
- mock_download_response = b"DOWNLOADED"
48
- expected_image_src = base64.b64encode(mock_download_response).decode()
49
- mocked_download_response = mocker.patch(
50
- "unstructured_ingest.utils.html.HtmlMixin.download_content",
51
- return_value=mock_download_response,
52
- )
53
- url = "http://mywebsite.com/path/to/page"
54
- html = """
55
- <img src="http://mywebsite.com/img1.jpg"/>
56
- <img src="http://notmywebsite.com/img2.jpg"/>
57
- <img src="http://allowedwebsite1.com/img2.jpg"/>
58
- <img src="http://allowedwebsite2.com/img2.jpg"/>
59
- """
60
-
61
- expected_html = f"""
62
- <img src="http://mywebsite.com/img1.jpg"/>
63
- <img src="http://notmywebsite.com/img2.jpg"/>
64
- <img src="data:image/png;base64,{expected_image_src}"/>
65
- <img src="data:image/png;base64,{expected_image_src}"/>
66
- """
67
- expected_soup = BeautifulSoup(expected_html, "html.parser")
68
- result = mixin.extract_html_images(url=url, html=html)
69
- result_soup = BeautifulSoup(result, "html.parser")
70
- assert expected_soup == result_soup
71
- assert mocked_download_response.call_count == 2
72
- urls_to_download = [
73
- call_args_list.kwargs["url"] for call_args_list in mocked_download_response.call_args_list
74
- ]
75
- assert urls_to_download == [
76
- "http://allowedwebsite1.com/img2.jpg",
77
- "http://allowedwebsite2.com/img2.jpg",
78
- ]
79
-
80
-
81
- def test_extract_embedded_docs(mocker: MockerFixture):
82
- mixin = HtmlMixin(extract_files=True)
83
- mock_download_response = b"DOWNLOADED"
84
- mocked_download_response = mocker.patch(
85
- "unstructured_ingest.utils.html.HtmlMixin.download_content",
86
- return_value=mock_download_response,
87
- )
88
- mocked_write_content = mocker.patch("unstructured_ingest.utils.html.HtmlMixin.write_content")
89
- url = "http://mywebsite.com/path/to/page"
90
- html = """
91
- <a href="http://mywebsite.com/file.pdf"/>
92
- <a href="http://notmywebsite.com/file.pdf"/>
93
- <a href="http://mywebsite.com/another/link"/>
94
- <a href="another/link/2"/>
95
- <a href="file.doc"/>
96
- """
97
- file_data = FileData(
98
- source_identifiers=SourceIdentifiers(
99
- fullpath="file.txt",
100
- filename="file.txt",
101
- ),
102
- connector_type="my_connector",
103
- identifier="mock_file_data",
104
- )
105
- results = mixin.extract_embedded_files(
106
- url=url, html=html, download_dir=Path("/tmp/download/location"), original_filedata=file_data
107
- )
108
- assert len(results) == 2
109
- downloaded_urls = [r["file_data"].metadata.url for r in results]
110
- assert downloaded_urls == ["http://mywebsite.com/file.pdf", "http://mywebsite.com/file.doc"]
111
- assert mocked_download_response.call_count == 2
112
- assert mocked_write_content.call_count == 2
test/unit/test_interfaces.py DELETED
@@ -1,26 +0,0 @@
1
- import pytest
2
- from pydantic import Secret, ValidationError
3
-
4
- from unstructured_ingest.interfaces import AccessConfig, ConnectionConfig
5
-
6
-
7
- def test_failing_connection_config():
8
- class MyAccessConfig(AccessConfig):
9
- sensitive_value: str
10
-
11
- class MyConnectionConfig(ConnectionConfig):
12
- access_config: MyAccessConfig
13
-
14
- with pytest.raises(ValidationError):
15
- MyConnectionConfig(access_config=MyAccessConfig(sensitive_value="this"))
16
-
17
-
18
- def test_happy_path_connection_config():
19
- class MyAccessConfig(AccessConfig):
20
- sensitive_value: str
21
-
22
- class MyConnectionConfig(ConnectionConfig):
23
- access_config: Secret[MyAccessConfig]
24
-
25
- connection_config = MyConnectionConfig(access_config=MyAccessConfig(sensitive_value="this"))
26
- assert connection_config