unstructured-ingest 0.7.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (192) hide show
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/logger.py +2 -93
  7. unstructured_ingest/main.py +0 -0
  8. unstructured_ingest/pipeline/interfaces.py +1 -1
  9. unstructured_ingest/pipeline/pipeline.py +1 -1
  10. unstructured_ingest/processes/chunker.py +4 -0
  11. unstructured_ingest/processes/connectors/airtable.py +4 -2
  12. unstructured_ingest/processes/connectors/assets/databricks_delta_table_schema.sql +10 -0
  13. unstructured_ingest/processes/connectors/assets/weaviate_collection_config.json +23 -0
  14. unstructured_ingest/processes/connectors/astradb.py +2 -2
  15. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  16. unstructured_ingest/processes/connectors/confluence.py +0 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  18. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  19. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  20. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  21. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  22. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  23. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  24. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  25. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  26. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  27. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  28. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  29. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  30. unstructured_ingest/processes/connectors/outlook.py +1 -2
  31. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  32. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  33. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  34. unstructured_ingest/processes/connectors/slack.py +1 -2
  35. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  37. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  38. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  39. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  40. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  41. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  42. unstructured_ingest/processes/connectors/vectara.py +0 -2
  43. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  44. unstructured_ingest/processes/embedder.py +2 -2
  45. unstructured_ingest/processes/filter.py +1 -1
  46. unstructured_ingest/processes/partitioner.py +4 -0
  47. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  48. unstructured_ingest/unstructured_api.py +13 -8
  49. unstructured_ingest/utils/data_prep.py +8 -32
  50. unstructured_ingest/utils/string_and_date_utils.py +3 -3
  51. unstructured_ingest-1.0.1.dist-info/METADATA +226 -0
  52. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/RECORD +54 -187
  53. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/WHEEL +1 -2
  54. examples/__init__.py +0 -0
  55. examples/airtable.py +0 -44
  56. examples/azure_cognitive_search.py +0 -55
  57. examples/chroma.py +0 -54
  58. examples/couchbase.py +0 -55
  59. examples/databricks_volumes_dest.py +0 -55
  60. examples/databricks_volumes_source.py +0 -53
  61. examples/delta_table.py +0 -45
  62. examples/discord_example.py +0 -36
  63. examples/elasticsearch.py +0 -49
  64. examples/google_drive.py +0 -45
  65. examples/kdbai.py +0 -54
  66. examples/local.py +0 -36
  67. examples/milvus.py +0 -44
  68. examples/mongodb.py +0 -53
  69. examples/opensearch.py +0 -50
  70. examples/pinecone.py +0 -57
  71. examples/s3.py +0 -38
  72. examples/salesforce.py +0 -44
  73. examples/sharepoint.py +0 -47
  74. examples/singlestore.py +0 -49
  75. examples/sql.py +0 -90
  76. examples/vectara.py +0 -54
  77. examples/weaviate.py +0 -44
  78. test/__init__.py +0 -0
  79. test/integration/__init__.py +0 -0
  80. test/integration/chunkers/__init__.py +0 -0
  81. test/integration/chunkers/test_chunkers.py +0 -31
  82. test/integration/connectors/__init__.py +0 -0
  83. test/integration/connectors/conftest.py +0 -38
  84. test/integration/connectors/databricks/__init__.py +0 -0
  85. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  86. test/integration/connectors/discord/__init__.py +0 -0
  87. test/integration/connectors/discord/test_discord.py +0 -90
  88. test/integration/connectors/duckdb/__init__.py +0 -0
  89. test/integration/connectors/duckdb/conftest.py +0 -14
  90. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  91. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  92. test/integration/connectors/elasticsearch/__init__.py +0 -0
  93. test/integration/connectors/elasticsearch/conftest.py +0 -34
  94. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  95. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  96. test/integration/connectors/sql/__init__.py +0 -0
  97. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  98. test/integration/connectors/sql/test_postgres.py +0 -201
  99. test/integration/connectors/sql/test_singlestore.py +0 -182
  100. test/integration/connectors/sql/test_snowflake.py +0 -244
  101. test/integration/connectors/sql/test_sqlite.py +0 -168
  102. test/integration/connectors/sql/test_vastdb.py +0 -34
  103. test/integration/connectors/test_astradb.py +0 -287
  104. test/integration/connectors/test_azure_ai_search.py +0 -254
  105. test/integration/connectors/test_chroma.py +0 -136
  106. test/integration/connectors/test_confluence.py +0 -111
  107. test/integration/connectors/test_delta_table.py +0 -183
  108. test/integration/connectors/test_dropbox.py +0 -151
  109. test/integration/connectors/test_github.py +0 -49
  110. test/integration/connectors/test_google_drive.py +0 -257
  111. test/integration/connectors/test_jira.py +0 -67
  112. test/integration/connectors/test_lancedb.py +0 -247
  113. test/integration/connectors/test_milvus.py +0 -208
  114. test/integration/connectors/test_mongodb.py +0 -335
  115. test/integration/connectors/test_neo4j.py +0 -244
  116. test/integration/connectors/test_notion.py +0 -152
  117. test/integration/connectors/test_onedrive.py +0 -163
  118. test/integration/connectors/test_pinecone.py +0 -387
  119. test/integration/connectors/test_qdrant.py +0 -216
  120. test/integration/connectors/test_redis.py +0 -143
  121. test/integration/connectors/test_s3.py +0 -184
  122. test/integration/connectors/test_sharepoint.py +0 -222
  123. test/integration/connectors/test_vectara.py +0 -282
  124. test/integration/connectors/test_zendesk.py +0 -120
  125. test/integration/connectors/utils/__init__.py +0 -0
  126. test/integration/connectors/utils/constants.py +0 -13
  127. test/integration/connectors/utils/docker.py +0 -151
  128. test/integration/connectors/utils/docker_compose.py +0 -59
  129. test/integration/connectors/utils/validation/__init__.py +0 -0
  130. test/integration/connectors/utils/validation/destination.py +0 -77
  131. test/integration/connectors/utils/validation/equality.py +0 -76
  132. test/integration/connectors/utils/validation/source.py +0 -331
  133. test/integration/connectors/utils/validation/utils.py +0 -36
  134. test/integration/connectors/weaviate/__init__.py +0 -0
  135. test/integration/connectors/weaviate/conftest.py +0 -15
  136. test/integration/connectors/weaviate/test_cloud.py +0 -39
  137. test/integration/connectors/weaviate/test_local.py +0 -152
  138. test/integration/embedders/__init__.py +0 -0
  139. test/integration/embedders/conftest.py +0 -13
  140. test/integration/embedders/test_azure_openai.py +0 -57
  141. test/integration/embedders/test_bedrock.py +0 -103
  142. test/integration/embedders/test_huggingface.py +0 -24
  143. test/integration/embedders/test_mixedbread.py +0 -71
  144. test/integration/embedders/test_octoai.py +0 -75
  145. test/integration/embedders/test_openai.py +0 -74
  146. test/integration/embedders/test_togetherai.py +0 -71
  147. test/integration/embedders/test_vertexai.py +0 -63
  148. test/integration/embedders/test_voyageai.py +0 -79
  149. test/integration/embedders/utils.py +0 -66
  150. test/integration/partitioners/__init__.py +0 -0
  151. test/integration/partitioners/test_partitioner.py +0 -76
  152. test/integration/utils.py +0 -15
  153. test/unit/__init__.py +0 -0
  154. test/unit/chunkers/__init__.py +0 -0
  155. test/unit/chunkers/test_chunkers.py +0 -49
  156. test/unit/connectors/__init__.py +0 -0
  157. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  158. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  159. test/unit/connectors/motherduck/__init__.py +0 -0
  160. test/unit/connectors/motherduck/test_base.py +0 -73
  161. test/unit/connectors/sql/__init__.py +0 -0
  162. test/unit/connectors/sql/test_sql.py +0 -152
  163. test/unit/connectors/test_confluence.py +0 -71
  164. test/unit/connectors/test_jira.py +0 -401
  165. test/unit/embed/__init__.py +0 -0
  166. test/unit/embed/test_mixedbreadai.py +0 -42
  167. test/unit/embed/test_octoai.py +0 -27
  168. test/unit/embed/test_openai.py +0 -28
  169. test/unit/embed/test_vertexai.py +0 -25
  170. test/unit/embed/test_voyageai.py +0 -24
  171. test/unit/embedders/__init__.py +0 -0
  172. test/unit/embedders/test_bedrock.py +0 -36
  173. test/unit/embedders/test_huggingface.py +0 -48
  174. test/unit/embedders/test_mixedbread.py +0 -37
  175. test/unit/embedders/test_octoai.py +0 -35
  176. test/unit/embedders/test_openai.py +0 -35
  177. test/unit/embedders/test_togetherai.py +0 -37
  178. test/unit/embedders/test_vertexai.py +0 -37
  179. test/unit/embedders/test_voyageai.py +0 -38
  180. test/unit/partitioners/__init__.py +0 -0
  181. test/unit/partitioners/test_partitioner.py +0 -63
  182. test/unit/test_error.py +0 -27
  183. test/unit/test_html.py +0 -112
  184. test/unit/test_interfaces.py +0 -26
  185. test/unit/test_logger.py +0 -78
  186. test/unit/test_utils.py +0 -220
  187. test/unit/utils/__init__.py +0 -0
  188. test/unit/utils/data_generator.py +0 -32
  189. unstructured_ingest-0.7.1.dist-info/METADATA +0 -383
  190. unstructured_ingest-0.7.1.dist-info/top_level.txt +0 -3
  191. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info}/entry_points.txt +0 -0
  192. {unstructured_ingest-0.7.1.dist-info → unstructured_ingest-1.0.1.dist-info/licenses}/LICENSE.md +0 -0
@@ -1,63 +0,0 @@
1
- import json
2
- import os
3
- from pathlib import Path
4
-
5
- import pytest
6
-
7
- from test.integration.embedders.utils import (
8
- validate_embedding_output,
9
- validate_raw_embedder,
10
- validate_raw_embedder_async,
11
- )
12
- from test.integration.utils import requires_env
13
- from unstructured_ingest.embed.vertexai import (
14
- AsyncVertexAIEmbeddingEncoder,
15
- VertexAIEmbeddingConfig,
16
- VertexAIEmbeddingEncoder,
17
- )
18
- from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
19
-
20
- API_KEY = "VERTEXAI_API_KEY"
21
-
22
-
23
- def get_api_key() -> str:
24
- api_key = os.getenv(API_KEY, None)
25
- assert api_key
26
- return api_key
27
-
28
-
29
- @requires_env(API_KEY)
30
- def test_vertexai_embedder(embedder_file: Path):
31
- api_key = get_api_key()
32
- embedder_config = EmbedderConfig(embedding_provider="vertexai", embedding_api_key=api_key)
33
- embedder = Embedder(config=embedder_config)
34
- results = embedder.run(elements_filepath=embedder_file)
35
- assert results
36
- with embedder_file.open("r") as f:
37
- original_elements = json.load(f)
38
- validate_embedding_output(original_elements=original_elements, output_elements=results)
39
-
40
-
41
- @requires_env(API_KEY)
42
- def test_raw_vertexai_embedder(embedder_file: Path):
43
- api_key = get_api_key()
44
- embedder = VertexAIEmbeddingEncoder(
45
- config=VertexAIEmbeddingConfig(
46
- api_key=api_key,
47
- )
48
- )
49
- validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=768)
50
-
51
-
52
- @requires_env(API_KEY)
53
- @pytest.mark.asyncio
54
- async def test_raw_async_vertexai_embedder(embedder_file: Path):
55
- api_key = get_api_key()
56
- embedder = AsyncVertexAIEmbeddingEncoder(
57
- config=VertexAIEmbeddingConfig(
58
- api_key=api_key,
59
- )
60
- )
61
- await validate_raw_embedder_async(
62
- embedder=embedder, embedder_file=embedder_file, expected_dimension=768
63
- )
@@ -1,79 +0,0 @@
1
- import json
2
- import os
3
- from pathlib import Path
4
-
5
- import pytest
6
-
7
- from test.integration.embedders.utils import (
8
- validate_embedding_output,
9
- validate_raw_embedder,
10
- validate_raw_embedder_async,
11
- )
12
- from test.integration.utils import requires_env
13
- from unstructured_ingest.embed.voyageai import (
14
- AsyncVoyageAIEmbeddingEncoder,
15
- VoyageAIEmbeddingConfig,
16
- VoyageAIEmbeddingEncoder,
17
- )
18
- from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
19
-
20
- API_KEY = "VOYAGEAI_API_KEY"
21
-
22
-
23
- def get_api_key() -> str:
24
- api_key = os.getenv(API_KEY, None)
25
- assert api_key
26
- return api_key
27
-
28
-
29
- @requires_env(API_KEY)
30
- def test_voyageai_embedder(embedder_file: Path):
31
- api_key = get_api_key()
32
- embedder_config = EmbedderConfig(embedding_provider="voyageai", embedding_api_key=api_key)
33
- embedder = Embedder(config=embedder_config)
34
- results = embedder.run(elements_filepath=embedder_file)
35
- assert results
36
- with embedder_file.open("r") as f:
37
- original_elements = json.load(f)
38
- validate_embedding_output(original_elements=original_elements, output_elements=results)
39
-
40
-
41
- @requires_env(API_KEY)
42
- def test_raw_voyageai_embedder(embedder_file: Path):
43
- api_key = get_api_key()
44
- embedder = VoyageAIEmbeddingEncoder(
45
- config=VoyageAIEmbeddingConfig(
46
- api_key=api_key,
47
- )
48
- )
49
- validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=1024)
50
-
51
-
52
- @requires_env(API_KEY)
53
- @pytest.mark.asyncio
54
- async def test_raw_async_voyageai_embedder(embedder_file: Path):
55
- api_key = get_api_key()
56
- embedder = AsyncVoyageAIEmbeddingEncoder(
57
- config=VoyageAIEmbeddingConfig(
58
- api_key=api_key,
59
- )
60
- )
61
- await validate_raw_embedder_async(
62
- embedder=embedder, embedder_file=embedder_file, expected_dimension=1024
63
- )
64
-
65
-
66
- @requires_env(API_KEY)
67
- def test_voyageai_multimodal_embedder(embedder_file: Path):
68
- api_key = get_api_key()
69
- embedder_config = EmbedderConfig(
70
- embedding_provider="voyageai",
71
- embedding_api_key=api_key,
72
- embedding_model_name="voyage-multimodal-3",
73
- )
74
- embedder = Embedder(config=embedder_config)
75
- results = embedder.run(elements_filepath=embedder_file)
76
- assert results
77
- with embedder_file.open("r") as f:
78
- original_elements = json.load(f)
79
- validate_embedding_output(original_elements=original_elements, output_elements=results)
@@ -1,66 +0,0 @@
1
- import json
2
- from pathlib import Path
3
- from typing import Optional
4
-
5
- from unstructured_ingest.embed.interfaces import AsyncBaseEmbeddingEncoder, BaseEmbeddingEncoder
6
-
7
-
8
- def validate_embedding_output(original_elements: list[dict], output_elements: list[dict]):
9
- """
10
- Make sure the following characteristics are met:
11
- * The same number of elements are returned
12
- * For each element that had text, an embeddings entry was added in the output
13
- * Other than the embedding, nothing about the element was changed
14
- """
15
- assert len(original_elements) == len(output_elements)
16
- for original_element, output_element in zip(original_elements, output_elements):
17
- if original_element.get("text"):
18
- assert output_element.get("embeddings", None)
19
- output_element.pop("embeddings", None)
20
- assert original_element == output_element
21
-
22
-
23
- def validate_raw_embedder(
24
- embedder: BaseEmbeddingEncoder,
25
- embedder_file: Path,
26
- expected_dimension: Optional[int] = None,
27
- expected_is_unit_vector: bool = True,
28
- ):
29
- with open(embedder_file) as f:
30
- elements = json.load(f)
31
- all_text = [element["text"] for element in elements]
32
- single_text = all_text[0]
33
- dimension = embedder.dimension
34
- if expected_dimension:
35
- assert (
36
- dimension == expected_dimension
37
- ), f"dimensions {dimension} didn't match expected: {expected_dimension}"
38
- is_unit_vector = embedder.is_unit_vector
39
- assert is_unit_vector == expected_is_unit_vector
40
- single_embedding = embedder.embed_query(query=single_text)
41
- assert len(single_embedding) == dimension
42
- embedded_elements = embedder.embed_documents(elements=elements)
43
- validate_embedding_output(original_elements=elements, output_elements=embedded_elements)
44
-
45
-
46
- async def validate_raw_embedder_async(
47
- embedder: AsyncBaseEmbeddingEncoder,
48
- embedder_file: Path,
49
- expected_dimension: Optional[int] = None,
50
- expected_is_unit_vector: bool = True,
51
- ):
52
- with open(embedder_file) as f:
53
- elements = json.load(f)
54
- all_text = [element["text"] for element in elements]
55
- single_text = all_text[0]
56
- dimension = await embedder.dimension
57
- if expected_dimension:
58
- assert (
59
- dimension == expected_dimension
60
- ), f"dimension {dimension} didn't match expected: {expected_dimension}"
61
- is_unit_vector = await embedder.is_unit_vector
62
- assert is_unit_vector == expected_is_unit_vector
63
- single_embedding = await embedder.embed_query(query=single_text)
64
- assert len(single_embedding) == dimension
65
- embedded_elements = await embedder.embed_documents(elements=elements)
66
- validate_embedding_output(original_elements=elements, output_elements=embedded_elements)
File without changes
@@ -1,76 +0,0 @@
1
- import os
2
- from pathlib import Path
3
-
4
- import pytest
5
-
6
- from test.integration.utils import requires_env
7
- from unstructured_ingest.errors_v2 import UserError
8
- from unstructured_ingest.processes.partitioner import Partitioner, PartitionerConfig
9
-
10
- int_test_dir = Path(__file__).parent
11
- assets_dir = int_test_dir / "assets"
12
-
13
- all_partition_files = [path for path in assets_dir.iterdir() if path.is_file()]
14
- non_image_partition_files = [
15
- path for path in all_partition_files if path.suffix not in [".jpg", ".png", ".tif"]
16
- ]
17
- supported_fast_partition_files = [
18
- path for path in non_image_partition_files if path.suffix != ".eml"
19
- ]
20
- image_partition_files = [
21
- path for path in all_partition_files if path not in non_image_partition_files
22
- ]
23
-
24
-
25
- @pytest.mark.parametrize(
26
- "partition_file", all_partition_files, ids=[path.name for path in all_partition_files]
27
- )
28
- @requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
29
- @pytest.mark.asyncio
30
- async def test_partitioner_api_hi_res(partition_file: Path):
31
- api_key = os.getenv("UNSTRUCTURED_API_KEY")
32
- api_url = os.getenv("UNSTRUCTURED_API_URL")
33
- partitioner_config = PartitionerConfig(
34
- strategy="hi_res", partition_by_api=True, api_key=api_key, partition_endpoint=api_url
35
- )
36
- partitioner = Partitioner(config=partitioner_config)
37
- results = await partitioner.run_async(filename=partition_file)
38
- assert results
39
-
40
-
41
- @pytest.mark.parametrize(
42
- "partition_file",
43
- supported_fast_partition_files,
44
- ids=[path.name for path in supported_fast_partition_files],
45
- )
46
- @requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
47
- @pytest.mark.asyncio
48
- async def test_partitioner_api_fast(partition_file: Path):
49
- api_key = os.getenv("UNSTRUCTURED_API_KEY")
50
- api_url = os.getenv("UNSTRUCTURED_API_URL")
51
- partitioner_config = PartitionerConfig(
52
- strategy="fast", partition_by_api=True, api_key=api_key, partition_endpoint=api_url
53
- )
54
- partitioner = Partitioner(config=partitioner_config)
55
- results = await partitioner.run_async(filename=partition_file)
56
- assert results
57
-
58
-
59
- @pytest.mark.parametrize(
60
- "partition_file", image_partition_files, ids=[path.name for path in image_partition_files]
61
- )
62
- @requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
63
- @pytest.mark.asyncio
64
- async def test_partitioner_api_fast_error(partition_file: Path):
65
- api_key = os.getenv("UNSTRUCTURED_API_KEY")
66
- api_url = os.getenv("UNSTRUCTURED_API_URL")
67
- partitioner_config = PartitionerConfig(
68
- strategy="fast",
69
- partition_by_api=True,
70
- api_key=api_key,
71
- partition_endpoint=api_url,
72
- raise_unsupported_filetype=True,
73
- )
74
- partitioner = Partitioner(config=partitioner_config)
75
- with pytest.raises(UserError):
76
- await partitioner.run_async(filename=partition_file)
test/integration/utils.py DELETED
@@ -1,15 +0,0 @@
1
- import os
2
-
3
- import pytest
4
-
5
-
6
- def requires_env(*envs):
7
- if len(envs) == 1:
8
- env = envs[0]
9
- return pytest.mark.skipif(
10
- env not in os.environ, reason=f"Environment variable not set: {env}"
11
- )
12
- return pytest.mark.skipif(
13
- not all(env in os.environ for env in envs),
14
- reason="All required environment variables not set: {}".format(", ".join(envs)),
15
- )
test/unit/__init__.py DELETED
File without changes
File without changes
@@ -1,49 +0,0 @@
1
- import random
2
-
3
- import faker
4
- import pytest
5
-
6
- from unstructured_ingest.processes.chunker import Chunker, ChunkerConfig
7
-
8
- fake = faker.Faker()
9
-
10
-
11
- def generate_chunker_config_params() -> dict:
12
- params = {}
13
- random_val = random.random()
14
- if random_val < 0.5:
15
- params["chunking_strategy"] = fake.word() if random.random() < 0.5 else None
16
- params["chunk_combine_text_under_n_chars"] = (
17
- fake.random_int() if random.random() < 0.5 else None
18
- )
19
- params["chunk_include_orig_elements"] = fake.boolean() if random.random() < 0.5 else None
20
- params["chunk_max_characters"] = fake.random_int()
21
- params["chunk_multipage_sections"] = fake.boolean()
22
- params["chunk_new_after_n_chars"] = fake.random_int() if random.random() < 0.5 else None
23
- params["chunk_overlap"] = fake.random_int() if random.random() < 0.5 else None
24
- params["chunk_overlap_all"] = fake.boolean() if random.random() < 0.5 else None
25
- if random_val < 0.5:
26
- params["chunk_by_api"] = True
27
- params["chunking_endpoint"] = fake.url()
28
- params["chunk_api_key"] = fake.password()
29
- else:
30
- params["chunk_by_api"] = False
31
-
32
- return params
33
-
34
-
35
- @pytest.mark.parametrize(
36
- "partition_config_params", [generate_chunker_config_params() for i in range(10)]
37
- )
38
- def test_chunker_config(partition_config_params: dict):
39
- chunker_config = ChunkerConfig.model_validate(partition_config_params)
40
- assert chunker_config
41
-
42
-
43
- @pytest.mark.parametrize(
44
- "partition_config_params", [generate_chunker_config_params() for i in range(10)]
45
- )
46
- def test_chunker(partition_config_params: dict):
47
- chunker_config = ChunkerConfig.model_validate(partition_config_params)
48
- chunker = Chunker(config=chunker_config)
49
- assert chunker
File without changes
File without changes