unstructured-ingest 0.7.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (192)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/logger.py +2 -93
  7. unstructured_ingest/main.py +0 -0
  8. unstructured_ingest/pipeline/interfaces.py +1 -1
  9. unstructured_ingest/pipeline/pipeline.py +1 -1
  10. unstructured_ingest/processes/chunker.py +4 -0
  11. unstructured_ingest/processes/connectors/airtable.py +4 -2
  12. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
  13. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  14. unstructured_ingest/processes/connectors/astradb.py +2 -2
  15. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  16. unstructured_ingest/processes/connectors/confluence.py +0 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  18. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  19. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  20. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  21. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  22. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  23. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  24. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  25. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  26. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  27. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  28. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  29. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  30. unstructured_ingest/processes/connectors/outlook.py +1 -2
  31. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  32. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  33. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  34. unstructured_ingest/processes/connectors/slack.py +1 -2
  35. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  37. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  38. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  39. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  40. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  41. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  42. unstructured_ingest/processes/connectors/vectara.py +0 -2
  43. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  44. unstructured_ingest/processes/embedder.py +2 -2
  45. unstructured_ingest/processes/filter.py +1 -1
  46. unstructured_ingest/processes/partitioner.py +4 -0
  47. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  48. unstructured_ingest/unstructured_api.py +13 -8
  49. unstructured_ingest/utils/data_prep.py +8 -32
  50. unstructured_ingest/utils/string_and_date_utils.py +3 -3
  51. unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
  52. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +54 -187
  53. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
  54. examples/__init__.py +0 -0
  55. examples/airtable.py +0 -44
  56. examples/azure_cognitive_search.py +0 -55
  57. examples/chroma.py +0 -54
  58. examples/couchbase.py +0 -55
  59. examples/databricks_volumes_dest.py +0 -55
  60. examples/databricks_volumes_source.py +0 -53
  61. examples/delta_table.py +0 -45
  62. examples/discord_example.py +0 -36
  63. examples/elasticsearch.py +0 -49
  64. examples/google_drive.py +0 -45
  65. examples/kdbai.py +0 -54
  66. examples/local.py +0 -36
  67. examples/milvus.py +0 -44
  68. examples/mongodb.py +0 -53
  69. examples/opensearch.py +0 -50
  70. examples/pinecone.py +0 -57
  71. examples/s3.py +0 -38
  72. examples/salesforce.py +0 -44
  73. examples/sharepoint.py +0 -47
  74. examples/singlestore.py +0 -49
  75. examples/sql.py +0 -90
  76. examples/vectara.py +0 -54
  77. examples/weaviate.py +0 -44
  78. test/__init__.py +0 -0
  79. test/integration/__init__.py +0 -0
  80. test/integration/chunkers/__init__.py +0 -0
  81. test/integration/chunkers/test_chunkers.py +0 -31
  82. test/integration/connectors/__init__.py +0 -0
  83. test/integration/connectors/conftest.py +0 -38
  84. test/integration/connectors/databricks/__init__.py +0 -0
  85. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  86. test/integration/connectors/discord/__init__.py +0 -0
  87. test/integration/connectors/discord/test_discord.py +0 -90
  88. test/integration/connectors/duckdb/__init__.py +0 -0
  89. test/integration/connectors/duckdb/conftest.py +0 -14
  90. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  91. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  92. test/integration/connectors/elasticsearch/__init__.py +0 -0
  93. test/integration/connectors/elasticsearch/conftest.py +0 -34
  94. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  95. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  96. test/integration/connectors/sql/__init__.py +0 -0
  97. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  98. test/integration/connectors/sql/test_postgres.py +0 -201
  99. test/integration/connectors/sql/test_singlestore.py +0 -182
  100. test/integration/connectors/sql/test_snowflake.py +0 -244
  101. test/integration/connectors/sql/test_sqlite.py +0 -168
  102. test/integration/connectors/sql/test_vastdb.py +0 -34
  103. test/integration/connectors/test_astradb.py +0 -287
  104. test/integration/connectors/test_azure_ai_search.py +0 -254
  105. test/integration/connectors/test_chroma.py +0 -136
  106. test/integration/connectors/test_confluence.py +0 -111
  107. test/integration/connectors/test_delta_table.py +0 -183
  108. test/integration/connectors/test_dropbox.py +0 -151
  109. test/integration/connectors/test_github.py +0 -49
  110. test/integration/connectors/test_google_drive.py +0 -257
  111. test/integration/connectors/test_jira.py +0 -67
  112. test/integration/connectors/test_lancedb.py +0 -247
  113. test/integration/connectors/test_milvus.py +0 -208
  114. test/integration/connectors/test_mongodb.py +0 -335
  115. test/integration/connectors/test_neo4j.py +0 -244
  116. test/integration/connectors/test_notion.py +0 -152
  117. test/integration/connectors/test_onedrive.py +0 -163
  118. test/integration/connectors/test_pinecone.py +0 -387
  119. test/integration/connectors/test_qdrant.py +0 -216
  120. test/integration/connectors/test_redis.py +0 -143
  121. test/integration/connectors/test_s3.py +0 -184
  122. test/integration/connectors/test_sharepoint.py +0 -222
  123. test/integration/connectors/test_vectara.py +0 -282
  124. test/integration/connectors/test_zendesk.py +0 -120
  125. test/integration/connectors/utils/__init__.py +0 -0
  126. test/integration/connectors/utils/constants.py +0 -13
  127. test/integration/connectors/utils/docker.py +0 -151
  128. test/integration/connectors/utils/docker_compose.py +0 -59
  129. test/integration/connectors/utils/validation/__init__.py +0 -0
  130. test/integration/connectors/utils/validation/destination.py +0 -77
  131. test/integration/connectors/utils/validation/equality.py +0 -76
  132. test/integration/connectors/utils/validation/source.py +0 -331
  133. test/integration/connectors/utils/validation/utils.py +0 -36
  134. test/integration/connectors/weaviate/__init__.py +0 -0
  135. test/integration/connectors/weaviate/conftest.py +0 -15
  136. test/integration/connectors/weaviate/test_cloud.py +0 -39
  137. test/integration/connectors/weaviate/test_local.py +0 -152
  138. test/integration/embedders/__init__.py +0 -0
  139. test/integration/embedders/conftest.py +0 -13
  140. test/integration/embedders/test_azure_openai.py +0 -57
  141. test/integration/embedders/test_bedrock.py +0 -103
  142. test/integration/embedders/test_huggingface.py +0 -24
  143. test/integration/embedders/test_mixedbread.py +0 -71
  144. test/integration/embedders/test_octoai.py +0 -75
  145. test/integration/embedders/test_openai.py +0 -74
  146. test/integration/embedders/test_togetherai.py +0 -71
  147. test/integration/embedders/test_vertexai.py +0 -63
  148. test/integration/embedders/test_voyageai.py +0 -79
  149. test/integration/embedders/utils.py +0 -66
  150. test/integration/partitioners/__init__.py +0 -0
  151. test/integration/partitioners/test_partitioner.py +0 -76
  152. test/integration/utils.py +0 -15
  153. test/unit/__init__.py +0 -0
  154. test/unit/chunkers/__init__.py +0 -0
  155. test/unit/chunkers/test_chunkers.py +0 -49
  156. test/unit/connectors/__init__.py +0 -0
  157. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  158. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  159. test/unit/connectors/motherduck/__init__.py +0 -0
  160. test/unit/connectors/motherduck/test_base.py +0 -73
  161. test/unit/connectors/sql/__init__.py +0 -0
  162. test/unit/connectors/sql/test_sql.py +0 -152
  163. test/unit/connectors/test_confluence.py +0 -71
  164. test/unit/connectors/test_jira.py +0 -401
  165. test/unit/embed/__init__.py +0 -0
  166. test/unit/embed/test_mixedbreadai.py +0 -42
  167. test/unit/embed/test_octoai.py +0 -27
  168. test/unit/embed/test_openai.py +0 -28
  169. test/unit/embed/test_vertexai.py +0 -25
  170. test/unit/embed/test_voyageai.py +0 -24
  171. test/unit/embedders/__init__.py +0 -0
  172. test/unit/embedders/test_bedrock.py +0 -36
  173. test/unit/embedders/test_huggingface.py +0 -48
  174. test/unit/embedders/test_mixedbread.py +0 -37
  175. test/unit/embedders/test_octoai.py +0 -35
  176. test/unit/embedders/test_openai.py +0 -35
  177. test/unit/embedders/test_togetherai.py +0 -37
  178. test/unit/embedders/test_vertexai.py +0 -37
  179. test/unit/embedders/test_voyageai.py +0 -38
  180. test/unit/partitioners/__init__.py +0 -0
  181. test/unit/partitioners/test_partitioner.py +0 -63
  182. test/unit/test_error.py +0 -27
  183. test/unit/test_html.py +0 -112
  184. test/unit/test_interfaces.py +0 -26
  185. test/unit/test_logger.py +0 -78
  186. test/unit/test_utils.py +0 -220
  187. test/unit/utils/__init__.py +0 -0
  188. test/unit/utils/data_generator.py +0 -32
  189. unstructured_ingest-0.7.1.dist-info/METADATA +0 -383
  190. unstructured_ingest-0.7.1.dist-info/top_level.txt +0 -3
  191. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
  192. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
test/unit/embedders/test_huggingface.py DELETED
@@ -1,48 +0,0 @@
1
- import random
2
- from typing import Any
3
-
4
- import faker
5
- import pytest
6
-
7
- from test.unit.utils.data_generator import generate_random_dictionary
8
- from unstructured_ingest.embed.huggingface import (
9
- HuggingFaceEmbeddingConfig,
10
- HuggingFaceEmbeddingEncoder,
11
- )
12
-
13
- fake = faker.Faker()
14
-
15
-
16
- def generate_embedder_config_params() -> dict:
17
- params = {}
18
- if random.random() < 0.5:
19
- params["embedder_model_name"] = fake.word() if random.random() < 0.5 else None
20
- params["embedder_model_kwargs"] = (
21
- generate_random_dictionary(key_type=str, value_type=Any)
22
- if random.random() < 0.5
23
- else None
24
- )
25
- params["encode_kwargs"] = (
26
- generate_random_dictionary(key_type=str, value_type=Any)
27
- if random.random() < 0.5
28
- else None
29
- )
30
- params["cache_folder"] = fake.file_path() if random.random() < 0.5 else None
31
- return params
32
-
33
-
34
- @pytest.mark.parametrize(
35
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
36
- )
37
- def test_embedder_config(embedder_config_params: dict):
38
- embedder_config = HuggingFaceEmbeddingConfig.model_validate(embedder_config_params)
39
- assert embedder_config
40
-
41
-
42
- @pytest.mark.parametrize(
43
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
44
- )
45
- def test_embedder(embedder_config_params: dict):
46
- embedder_config = HuggingFaceEmbeddingConfig.model_validate(embedder_config_params)
47
- embedder = HuggingFaceEmbeddingEncoder(config=embedder_config)
48
- assert embedder
test/unit/embedders/test_mixedbread.py DELETED
@@ -1,37 +0,0 @@
1
- import random
2
-
3
- import faker
4
- import pytest
5
-
6
- from unstructured_ingest.embed.mixedbreadai import (
7
- MixedbreadAIEmbeddingConfig,
8
- MixedbreadAIEmbeddingEncoder,
9
- )
10
-
11
- fake = faker.Faker()
12
-
13
-
14
- def generate_embedder_config_params() -> dict:
15
- params = {
16
- "api_key": fake.password(),
17
- }
18
- if random.random() < 0.5:
19
- params["embedder_model_name"] = fake.word()
20
- return params
21
-
22
-
23
- @pytest.mark.parametrize(
24
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
25
- )
26
- def test_embedder_config(embedder_config_params: dict):
27
- embedder_config = MixedbreadAIEmbeddingConfig.model_validate(embedder_config_params)
28
- assert embedder_config
29
-
30
-
31
- @pytest.mark.parametrize(
32
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
33
- )
34
- def test_embedder(embedder_config_params: dict):
35
- embedder_config = MixedbreadAIEmbeddingConfig.model_validate(embedder_config_params)
36
- embedder = MixedbreadAIEmbeddingEncoder(config=embedder_config)
37
- assert embedder
test/unit/embedders/test_octoai.py DELETED
@@ -1,35 +0,0 @@
1
- import random
2
-
3
- import faker
4
- import pytest
5
-
6
- from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
7
-
8
- fake = faker.Faker()
9
-
10
-
11
- def generate_embedder_config_params() -> dict:
12
- params = {
13
- "api_key": fake.password(),
14
- }
15
- if random.random() < 0.5:
16
- params["embedder_model_name"] = fake.word()
17
- params["base_url"] = fake.url()
18
- return params
19
-
20
-
21
- @pytest.mark.parametrize(
22
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
23
- )
24
- def test_embedder_config(embedder_config_params: dict):
25
- embedder_config = OctoAiEmbeddingConfig.model_validate(embedder_config_params)
26
- assert embedder_config
27
-
28
-
29
- @pytest.mark.parametrize(
30
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
31
- )
32
- def test_embedder(embedder_config_params: dict):
33
- embedder_config = OctoAiEmbeddingConfig.model_validate(embedder_config_params)
34
- embedder = OctoAIEmbeddingEncoder(config=embedder_config)
35
- assert embedder
test/unit/embedders/test_openai.py DELETED
@@ -1,35 +0,0 @@
1
- import random
2
-
3
- import faker
4
- import pytest
5
-
6
- from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
7
-
8
- fake = faker.Faker()
9
-
10
-
11
- def generate_embedder_config_params() -> dict:
12
- params = {
13
- "api_key": fake.password(),
14
- }
15
- if random.random() < 0.5:
16
- params["embedder_model_name"] = fake.word()
17
- params["base_url"] = fake.url()
18
- return params
19
-
20
-
21
- @pytest.mark.parametrize(
22
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
23
- )
24
- def test_embedder_config(embedder_config_params: dict):
25
- embedder_config = OpenAIEmbeddingConfig.model_validate(embedder_config_params)
26
- assert embedder_config
27
-
28
-
29
- @pytest.mark.parametrize(
30
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
31
- )
32
- def test_embedder(embedder_config_params: dict):
33
- embedder_config = OpenAIEmbeddingConfig.model_validate(embedder_config_params)
34
- embedder = OpenAIEmbeddingEncoder(config=embedder_config)
35
- assert embedder
test/unit/embedders/test_togetherai.py DELETED
@@ -1,37 +0,0 @@
1
- import random
2
-
3
- import faker
4
- import pytest
5
-
6
- from unstructured_ingest.embed.togetherai import (
7
- TogetherAIEmbeddingConfig,
8
- TogetherAIEmbeddingEncoder,
9
- )
10
-
11
- fake = faker.Faker()
12
-
13
-
14
- def generate_embedder_config_params() -> dict:
15
- params = {
16
- "api_key": fake.password(),
17
- }
18
- if random.random() < 0.5:
19
- params["embedder_model_name"] = fake.word()
20
- return params
21
-
22
-
23
- @pytest.mark.parametrize(
24
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
25
- )
26
- def test_embedder_config(embedder_config_params: dict):
27
- embedder_config = TogetherAIEmbeddingConfig.model_validate(embedder_config_params)
28
- assert embedder_config
29
-
30
-
31
- @pytest.mark.parametrize(
32
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
33
- )
34
- def test_embedder(embedder_config_params: dict):
35
- embedder_config = TogetherAIEmbeddingConfig.model_validate(embedder_config_params)
36
- embedder = TogetherAIEmbeddingEncoder(config=embedder_config)
37
- assert embedder
test/unit/embedders/test_vertexai.py DELETED
@@ -1,37 +0,0 @@
1
- import json
2
- import random
3
- from typing import Any
4
-
5
- import faker
6
- import pytest
7
-
8
- from test.unit.utils.data_generator import generate_random_dictionary
9
- from unstructured_ingest.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder
10
-
11
- fake = faker.Faker()
12
-
13
-
14
- def generate_embedder_config_params() -> dict:
15
- params = {
16
- "api_key": json.dumps(generate_random_dictionary(key_type=str, value_type=Any)),
17
- }
18
- if random.random() < 0.5:
19
- params["embedder_model_name"] = fake.word()
20
- return params
21
-
22
-
23
- @pytest.mark.parametrize(
24
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
25
- )
26
- def test_embedder_config(embedder_config_params: dict):
27
- embedder_config = VertexAIEmbeddingConfig.model_validate(embedder_config_params)
28
- assert embedder_config
29
-
30
-
31
- @pytest.mark.parametrize(
32
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
33
- )
34
- def test_embedder(embedder_config_params: dict):
35
- embedder_config = VertexAIEmbeddingConfig.model_validate(embedder_config_params)
36
- embedder = VertexAIEmbeddingEncoder(config=embedder_config)
37
- assert embedder
test/unit/embedders/test_voyageai.py DELETED
@@ -1,38 +0,0 @@
1
- import random
2
-
3
- import faker
4
- import pytest
5
-
6
- from unstructured_ingest.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
7
-
8
- fake = faker.Faker()
9
-
10
-
11
- def generate_embedder_config_params() -> dict:
12
- params = {
13
- "api_key": fake.password(),
14
- }
15
- if random.random() < 0.5:
16
- params["embedder_model_name"] = fake.word()
17
- params["batch_size"] = fake.random_int(max=100)
18
- params["truncation"] = fake.boolean()
19
- params["max_retries"] = fake.random_int()
20
- params["timeout_in_seconds"] = fake.random_int()
21
- return params
22
-
23
-
24
- @pytest.mark.parametrize(
25
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
26
- )
27
- def test_embedder_config(embedder_config_params: dict):
28
- embedder_config = VoyageAIEmbeddingConfig.model_validate(embedder_config_params)
29
- assert embedder_config
30
-
31
-
32
- @pytest.mark.parametrize(
33
- "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
34
- )
35
- def test_embedder(embedder_config_params: dict):
36
- embedder_config = VoyageAIEmbeddingConfig.model_validate(embedder_config_params)
37
- embedder = VoyageAIEmbeddingEncoder(config=embedder_config)
38
- assert embedder
test/unit/partitioners/__init__.py DELETED
File without changes
test/unit/partitioners/test_partitioner.py DELETED
@@ -1,63 +0,0 @@
1
- import random
2
- from typing import Any
3
-
4
- import faker
5
- import pytest
6
-
7
- from test.unit.utils.data_generator import generate_random_dictionary
8
- from unstructured_ingest.processes.partitioner import Partitioner, PartitionerConfig
9
-
10
- fake = faker.Faker()
11
-
12
-
13
- def generate_partitioner_config_params() -> dict:
14
- params = {
15
- "strategy": random.choice(["fast", "hi_res", "auto"]),
16
- "ocr_languages": fake.words() if random.random() < 0.5 else None,
17
- "encoding": fake.word() if random.random() < 0.5 else None,
18
- "additional_partition_args": (
19
- generate_random_dictionary(key_type=str, value_type=Any)
20
- if random.random() < 0.5
21
- else None
22
- ),
23
- "skip_infer_table_types": fake.words() if random.random() < 0.5 else None,
24
- "flatten_metadata": fake.boolean(),
25
- "hi_res_model_name": fake.word() if random.random() < 0.5 else None,
26
- }
27
- random_val = random.random()
28
- # Randomly set the fields_include to a random list[str]
29
- if random_val < 0.5:
30
- params["fields_include"] = fake.words()
31
-
32
- # Randomly set the metadata_exclude or metadata_include to a valid
33
- # list[str] or don't set it at all
34
- if random.random() < (1 / 3):
35
- params["metadata_exclude"] = fake.words()
36
- elif random_val < (2 / 3):
37
- params["metadata_include"] = fake.words()
38
-
39
- # Randomly set the values associated with calling the api, or not at all
40
- if random.random() < 0.5:
41
- params["partition_by_api"]: True
42
- params["partition_endpoint"] = fake.url()
43
- params["api_key"] = fake.password()
44
- else:
45
- params["partition_by_api"]: False
46
- return params
47
-
48
-
49
- @pytest.mark.parametrize(
50
- "partition_config_params", [generate_partitioner_config_params() for i in range(10)]
51
- )
52
- def test_partition_config(partition_config_params: dict):
53
- partition_config = PartitionerConfig.model_validate(partition_config_params)
54
- assert partition_config
55
-
56
-
57
- @pytest.mark.parametrize(
58
- "partition_config_params", [generate_partitioner_config_params() for i in range(10)]
59
- )
60
- def test_partitioner(partition_config_params: dict):
61
- partition_config = PartitionerConfig.model_validate(partition_config_params)
62
- partitioner = Partitioner(config=partition_config)
63
- assert partitioner
test/unit/test_error.py DELETED
@@ -1,27 +0,0 @@
1
- import pytest
2
-
3
- from unstructured_ingest.error import (
4
- DestinationConnectionError,
5
- PartitionError,
6
- SourceConnectionError,
7
- )
8
-
9
-
10
- @pytest.mark.parametrize(
11
- ("error_class", "exception_type", "error_message"),
12
- [
13
- (SourceConnectionError, ValueError, "Simulated connection error"),
14
- (DestinationConnectionError, RuntimeError, "Simulated connection error"),
15
- (PartitionError, FileNotFoundError, "Simulated partition error"),
16
- ],
17
- )
18
- def test_custom_error_decorator(error_class, exception_type, error_message):
19
- @error_class.wrap
20
- def simulate_error():
21
- raise exception_type(error_message)
22
-
23
- with pytest.raises(error_class) as context:
24
- simulate_error()
25
-
26
- expected_error_string = error_class.error_string.format(error_message)
27
- assert str(context.value) == expected_error_string
test/unit/test_html.py DELETED
@@ -1,112 +0,0 @@
1
- import base64
2
- from pathlib import Path
3
-
4
- from bs4 import BeautifulSoup
5
- from pytest_mock import MockerFixture
6
-
7
- from unstructured_ingest.data_types.file_data import FileData, SourceIdentifiers
8
- from unstructured_ingest.utils.html import HtmlMixin
9
-
10
-
11
- def test_extract_images(mocker: MockerFixture):
12
- mixin = HtmlMixin(extract_images=True)
13
- mock_download_response = b"DOWNLOADED"
14
- expected_image_src = base64.b64encode(mock_download_response).decode()
15
- mocked_download_response = mocker.patch(
16
- "unstructured_ingest.utils.html.HtmlMixin.download_content",
17
- return_value=mock_download_response,
18
- )
19
- url = "http://mywebsite.com/path/to/page"
20
- html = """
21
- <img src="http://mywebsite.com/img1.jpg"/>
22
- <img src="http://notmywebsite.com/img2.jpg"/>
23
- <img src="img3.jpg"/>
24
- <img src="..."/>
25
- """
26
- expected_html = f"""
27
- <img src="data:image/png;base64,{expected_image_src}"/>
28
- <img src="http://notmywebsite.com/img2.jpg"/>
29
- <img src="data:image/png;base64,{expected_image_src}"/>
30
- <img src="..."/>
31
- """
32
- expected_soup = BeautifulSoup(expected_html, "html.parser")
33
- result = mixin.extract_html_images(url=url, html=html)
34
- result_soup = BeautifulSoup(result, "html.parser")
35
- assert expected_soup == result_soup
36
- assert mocked_download_response.call_count == 2
37
- urls_to_download = [
38
- call_args_list.kwargs["url"] for call_args_list in mocked_download_response.call_args_list
39
- ]
40
- assert urls_to_download == ["http://mywebsite.com/img1.jpg", "http://mywebsite.com/img3.jpg"]
41
-
42
-
43
- def test_extract_images_allow_list(mocker: MockerFixture):
44
- mixin = HtmlMixin(
45
- extract_images=True, allow_list=["http://allowedwebsite1.com", "http://allowedwebsite2.com"]
46
- )
47
- mock_download_response = b"DOWNLOADED"
48
- expected_image_src = base64.b64encode(mock_download_response).decode()
49
- mocked_download_response = mocker.patch(
50
- "unstructured_ingest.utils.html.HtmlMixin.download_content",
51
- return_value=mock_download_response,
52
- )
53
- url = "http://mywebsite.com/path/to/page"
54
- html = """
55
- <img src="http://mywebsite.com/img1.jpg"/>
56
- <img src="http://notmywebsite.com/img2.jpg"/>
57
- <img src="http://allowedwebsite1.com/img2.jpg"/>
58
- <img src="http://allowedwebsite2.com/img2.jpg"/>
59
- """
60
-
61
- expected_html = f"""
62
- <img src="http://mywebsite.com/img1.jpg"/>
63
- <img src="http://notmywebsite.com/img2.jpg"/>
64
- <img src="data:image/png;base64,{expected_image_src}"/>
65
- <img src="data:image/png;base64,{expected_image_src}"/>
66
- """
67
- expected_soup = BeautifulSoup(expected_html, "html.parser")
68
- result = mixin.extract_html_images(url=url, html=html)
69
- result_soup = BeautifulSoup(result, "html.parser")
70
- assert expected_soup == result_soup
71
- assert mocked_download_response.call_count == 2
72
- urls_to_download = [
73
- call_args_list.kwargs["url"] for call_args_list in mocked_download_response.call_args_list
74
- ]
75
- assert urls_to_download == [
76
- "http://allowedwebsite1.com/img2.jpg",
77
- "http://allowedwebsite2.com/img2.jpg",
78
- ]
79
-
80
-
81
- def test_extract_embedded_docs(mocker: MockerFixture):
82
- mixin = HtmlMixin(extract_files=True)
83
- mock_download_response = b"DOWNLOADED"
84
- mocked_download_response = mocker.patch(
85
- "unstructured_ingest.utils.html.HtmlMixin.download_content",
86
- return_value=mock_download_response,
87
- )
88
- mocked_write_content = mocker.patch("unstructured_ingest.utils.html.HtmlMixin.write_content")
89
- url = "http://mywebsite.com/path/to/page"
90
- html = """
91
- <a href="http://mywebsite.com/file.pdf"/>
92
- <a href="http://notmywebsite.com/file.pdf"/>
93
- <a href="http://mywebsite.com/another/link"/>
94
- <a href="another/link/2"/>
95
- <a href="file.doc"/>
96
- """
97
- file_data = FileData(
98
- source_identifiers=SourceIdentifiers(
99
- fullpath="file.txt",
100
- filename="file.txt",
101
- ),
102
- connector_type="my_connector",
103
- identifier="mock_file_data",
104
- )
105
- results = mixin.extract_embedded_files(
106
- url=url, html=html, download_dir=Path("/tmp/download/location"), original_filedata=file_data
107
- )
108
- assert len(results) == 2
109
- downloaded_urls = [r["file_data"].metadata.url for r in results]
110
- assert downloaded_urls == ["http://mywebsite.com/file.pdf", "http://mywebsite.com/file.doc"]
111
- assert mocked_download_response.call_count == 2
112
- assert mocked_write_content.call_count == 2
test/unit/test_interfaces.py DELETED
@@ -1,26 +0,0 @@
1
- import pytest
2
- from pydantic import Secret, ValidationError
3
-
4
- from unstructured_ingest.interfaces import AccessConfig, ConnectionConfig
5
-
6
-
7
- def test_failing_connection_config():
8
- class MyAccessConfig(AccessConfig):
9
- sensitive_value: str
10
-
11
- class MyConnectionConfig(ConnectionConfig):
12
- access_config: MyAccessConfig
13
-
14
- with pytest.raises(ValidationError):
15
- MyConnectionConfig(access_config=MyAccessConfig(sensitive_value="this"))
16
-
17
-
18
- def test_happy_path_connection_config():
19
- class MyAccessConfig(AccessConfig):
20
- sensitive_value: str
21
-
22
- class MyConnectionConfig(ConnectionConfig):
23
- access_config: Secret[MyAccessConfig]
24
-
25
- connection_config = MyConnectionConfig(access_config=MyAccessConfig(sensitive_value="this"))
26
- assert connection_config
test/unit/test_logger.py DELETED
@@ -1,78 +0,0 @@
1
- import json
2
-
3
- import pytest
4
-
5
- from unstructured_ingest.logger import (
6
- default_is_data_sensitive,
7
- hide_sensitive_fields,
8
- redact_jsons,
9
- )
10
-
11
-
12
- @pytest.mark.parametrize(
13
- ("key", "value", "is_sensitive"),
14
- [
15
- ("username", "john_smith", False),
16
- ("password", "13?H%", True),
17
- ("token", "123", True),
18
- ("AWS_CREDENTIAL", "aws_credential", True),
19
- ("AWS_KEY", None, False),
20
- ],
21
- )
22
- def test_default_is_sensitive(key, value, is_sensitive):
23
- assert default_is_data_sensitive(key, value) == is_sensitive
24
-
25
-
26
- def test_hide_sensitive_fields():
27
- d = {
28
- "username": "john_smith",
29
- "password": "13?H%",
30
- "inner": {
31
- "token": "123",
32
- "AWS_KEY": None,
33
- "inner_j_string": json.dumps(
34
- {"account_name": "secret name", "client_id": 123, "timestamp": 123}
35
- ),
36
- },
37
- }
38
- redacted_d = hide_sensitive_fields(d)
39
- expected_d = {
40
- "password": "*******",
41
- "username": "john_smith",
42
- "inner": {
43
- "token": "*******",
44
- "AWS_KEY": None,
45
- "inner_j_string": json.dumps(
46
- {"account_name": "*******", "client_id": "*******", "timestamp": 123}
47
- ),
48
- },
49
- }
50
- assert redacted_d == expected_d
51
-
52
-
53
- def test_redact_jsons():
54
- d1 = {
55
- "username": "john_smith",
56
- "password": "13?H%",
57
- "inner": {
58
- "token": "123",
59
- "AWS_KEY": None,
60
- "inner_j_string": json.dumps(
61
- {"account_name": "secret name", "client_id": 123, "timestamp": 123}
62
- ),
63
- },
64
- }
65
-
66
- d2 = {"username": "tim67", "update_time": 456}
67
- d3 = {"account_name": "top secret", "host": "http://localhost:8888"}
68
-
69
- sensitive_string = f"Some topic secret info ({json.dumps(d1)} regarding {d2} and {d3})"
70
- expected_string = (
71
- 'Some topic secret info ({"username": "john_smith", "password": "*******", '
72
- '"inner": {"token": "*******", "AWS_KEY": null, "inner_j_string": '
73
- '"{\\"account_name\\": \\"*******\\", \\"client_id\\": \\"*******\\", '
74
- '\\"timestamp\\": 123}"}} regarding {"username": "tim67", "update_time": 456} '
75
- 'and {"account_name": "*******", "host": "http://localhost:8888"})'
76
- )
77
- redacted_string = redact_jsons(sensitive_string)
78
- assert redacted_string == expected_string