unstructured-ingest 0.7.2__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

Files changed (187)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/README.md +28 -0
  3. unstructured_ingest/embed/mixedbreadai.py +0 -1
  4. unstructured_ingest/interfaces/upload_stager.py +2 -2
  5. unstructured_ingest/interfaces/uploader.py +3 -3
  6. unstructured_ingest/main.py +0 -0
  7. unstructured_ingest/pipeline/interfaces.py +1 -1
  8. unstructured_ingest/pipeline/pipeline.py +1 -1
  9. unstructured_ingest/processes/chunker.py +4 -0
  10. unstructured_ingest/processes/connectors/airtable.py +4 -2
  11. unstructured_ingest/processes/connectors/astradb.py +48 -34
  12. unstructured_ingest/processes/connectors/azure_ai_search.py +1 -1
  13. unstructured_ingest/processes/connectors/confluence.py +0 -1
  14. unstructured_ingest/processes/connectors/databricks/volumes_aws.py +1 -1
  15. unstructured_ingest/processes/connectors/databricks/volumes_azure.py +2 -2
  16. unstructured_ingest/processes/connectors/databricks/volumes_gcp.py +1 -1
  17. unstructured_ingest/processes/connectors/databricks/volumes_table.py +1 -2
  18. unstructured_ingest/processes/connectors/delta_table.py +1 -0
  19. unstructured_ingest/processes/connectors/duckdb/base.py +2 -2
  20. unstructured_ingest/processes/connectors/duckdb/duckdb.py +3 -3
  21. unstructured_ingest/processes/connectors/duckdb/motherduck.py +3 -3
  22. unstructured_ingest/processes/connectors/fsspec/s3.py +5 -3
  23. unstructured_ingest/processes/connectors/gitlab.py +1 -2
  24. unstructured_ingest/processes/connectors/google_drive.py +0 -2
  25. unstructured_ingest/processes/connectors/ibm_watsonx/ibm_watsonx_s3.py +9 -7
  26. unstructured_ingest/processes/connectors/kdbai.py +1 -0
  27. unstructured_ingest/processes/connectors/outlook.py +1 -2
  28. unstructured_ingest/processes/connectors/pinecone.py +0 -1
  29. unstructured_ingest/processes/connectors/redisdb.py +28 -24
  30. unstructured_ingest/processes/connectors/salesforce.py +1 -1
  31. unstructured_ingest/processes/connectors/slack.py +1 -2
  32. unstructured_ingest/processes/connectors/sql/databricks_delta_tables.py +5 -0
  33. unstructured_ingest/processes/connectors/sql/postgres.py +7 -1
  34. unstructured_ingest/processes/connectors/sql/singlestore.py +11 -6
  35. unstructured_ingest/processes/connectors/sql/snowflake.py +5 -0
  36. unstructured_ingest/processes/connectors/sql/sql.py +3 -4
  37. unstructured_ingest/processes/connectors/sql/sqlite.py +5 -0
  38. unstructured_ingest/processes/connectors/sql/vastdb.py +7 -3
  39. unstructured_ingest/processes/connectors/vectara.py +0 -2
  40. unstructured_ingest/processes/connectors/zendesk/zendesk.py +0 -2
  41. unstructured_ingest/processes/embedder.py +2 -2
  42. unstructured_ingest/processes/filter.py +1 -1
  43. unstructured_ingest/processes/partitioner.py +4 -0
  44. unstructured_ingest/processes/utils/blob_storage.py +2 -2
  45. unstructured_ingest/unstructured_api.py +13 -8
  46. unstructured_ingest/utils/data_prep.py +8 -32
  47. unstructured_ingest-1.0.2.dist-info/METADATA +226 -0
  48. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/RECORD +50 -184
  49. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/WHEEL +1 -2
  50. examples/__init__.py +0 -0
  51. examples/airtable.py +0 -44
  52. examples/azure_cognitive_search.py +0 -55
  53. examples/chroma.py +0 -54
  54. examples/couchbase.py +0 -55
  55. examples/databricks_volumes_dest.py +0 -55
  56. examples/databricks_volumes_source.py +0 -53
  57. examples/delta_table.py +0 -45
  58. examples/discord_example.py +0 -36
  59. examples/elasticsearch.py +0 -49
  60. examples/google_drive.py +0 -45
  61. examples/kdbai.py +0 -54
  62. examples/local.py +0 -36
  63. examples/milvus.py +0 -44
  64. examples/mongodb.py +0 -53
  65. examples/opensearch.py +0 -50
  66. examples/pinecone.py +0 -57
  67. examples/s3.py +0 -38
  68. examples/salesforce.py +0 -44
  69. examples/sharepoint.py +0 -47
  70. examples/singlestore.py +0 -49
  71. examples/sql.py +0 -90
  72. examples/vectara.py +0 -54
  73. examples/weaviate.py +0 -44
  74. test/__init__.py +0 -0
  75. test/integration/__init__.py +0 -0
  76. test/integration/chunkers/__init__.py +0 -0
  77. test/integration/chunkers/test_chunkers.py +0 -31
  78. test/integration/connectors/__init__.py +0 -0
  79. test/integration/connectors/conftest.py +0 -38
  80. test/integration/connectors/databricks/__init__.py +0 -0
  81. test/integration/connectors/databricks/test_volumes_native.py +0 -273
  82. test/integration/connectors/discord/__init__.py +0 -0
  83. test/integration/connectors/discord/test_discord.py +0 -90
  84. test/integration/connectors/duckdb/__init__.py +0 -0
  85. test/integration/connectors/duckdb/conftest.py +0 -14
  86. test/integration/connectors/duckdb/test_duckdb.py +0 -90
  87. test/integration/connectors/duckdb/test_motherduck.py +0 -95
  88. test/integration/connectors/elasticsearch/__init__.py +0 -0
  89. test/integration/connectors/elasticsearch/conftest.py +0 -34
  90. test/integration/connectors/elasticsearch/test_elasticsearch.py +0 -331
  91. test/integration/connectors/elasticsearch/test_opensearch.py +0 -326
  92. test/integration/connectors/sql/__init__.py +0 -0
  93. test/integration/connectors/sql/test_databricks_delta_tables.py +0 -170
  94. test/integration/connectors/sql/test_postgres.py +0 -201
  95. test/integration/connectors/sql/test_singlestore.py +0 -182
  96. test/integration/connectors/sql/test_snowflake.py +0 -244
  97. test/integration/connectors/sql/test_sqlite.py +0 -168
  98. test/integration/connectors/sql/test_vastdb.py +0 -34
  99. test/integration/connectors/test_astradb.py +0 -287
  100. test/integration/connectors/test_azure_ai_search.py +0 -254
  101. test/integration/connectors/test_chroma.py +0 -136
  102. test/integration/connectors/test_confluence.py +0 -111
  103. test/integration/connectors/test_delta_table.py +0 -183
  104. test/integration/connectors/test_dropbox.py +0 -151
  105. test/integration/connectors/test_github.py +0 -49
  106. test/integration/connectors/test_google_drive.py +0 -257
  107. test/integration/connectors/test_jira.py +0 -67
  108. test/integration/connectors/test_lancedb.py +0 -247
  109. test/integration/connectors/test_milvus.py +0 -208
  110. test/integration/connectors/test_mongodb.py +0 -335
  111. test/integration/connectors/test_neo4j.py +0 -244
  112. test/integration/connectors/test_notion.py +0 -152
  113. test/integration/connectors/test_onedrive.py +0 -163
  114. test/integration/connectors/test_pinecone.py +0 -387
  115. test/integration/connectors/test_qdrant.py +0 -216
  116. test/integration/connectors/test_redis.py +0 -143
  117. test/integration/connectors/test_s3.py +0 -184
  118. test/integration/connectors/test_sharepoint.py +0 -222
  119. test/integration/connectors/test_vectara.py +0 -282
  120. test/integration/connectors/test_zendesk.py +0 -120
  121. test/integration/connectors/utils/__init__.py +0 -0
  122. test/integration/connectors/utils/constants.py +0 -13
  123. test/integration/connectors/utils/docker.py +0 -151
  124. test/integration/connectors/utils/docker_compose.py +0 -59
  125. test/integration/connectors/utils/validation/__init__.py +0 -0
  126. test/integration/connectors/utils/validation/destination.py +0 -77
  127. test/integration/connectors/utils/validation/equality.py +0 -76
  128. test/integration/connectors/utils/validation/source.py +0 -331
  129. test/integration/connectors/utils/validation/utils.py +0 -36
  130. test/integration/connectors/weaviate/__init__.py +0 -0
  131. test/integration/connectors/weaviate/conftest.py +0 -15
  132. test/integration/connectors/weaviate/test_cloud.py +0 -39
  133. test/integration/connectors/weaviate/test_local.py +0 -152
  134. test/integration/embedders/__init__.py +0 -0
  135. test/integration/embedders/conftest.py +0 -13
  136. test/integration/embedders/test_azure_openai.py +0 -57
  137. test/integration/embedders/test_bedrock.py +0 -103
  138. test/integration/embedders/test_huggingface.py +0 -24
  139. test/integration/embedders/test_mixedbread.py +0 -71
  140. test/integration/embedders/test_octoai.py +0 -75
  141. test/integration/embedders/test_openai.py +0 -74
  142. test/integration/embedders/test_togetherai.py +0 -71
  143. test/integration/embedders/test_vertexai.py +0 -63
  144. test/integration/embedders/test_voyageai.py +0 -79
  145. test/integration/embedders/utils.py +0 -66
  146. test/integration/partitioners/__init__.py +0 -0
  147. test/integration/partitioners/test_partitioner.py +0 -76
  148. test/integration/utils.py +0 -15
  149. test/unit/__init__.py +0 -0
  150. test/unit/chunkers/__init__.py +0 -0
  151. test/unit/chunkers/test_chunkers.py +0 -49
  152. test/unit/connectors/__init__.py +0 -0
  153. test/unit/connectors/ibm_watsonx/__init__.py +0 -0
  154. test/unit/connectors/ibm_watsonx/test_ibm_watsonx_s3.py +0 -459
  155. test/unit/connectors/motherduck/__init__.py +0 -0
  156. test/unit/connectors/motherduck/test_base.py +0 -73
  157. test/unit/connectors/sql/__init__.py +0 -0
  158. test/unit/connectors/sql/test_sql.py +0 -152
  159. test/unit/connectors/test_confluence.py +0 -71
  160. test/unit/connectors/test_jira.py +0 -401
  161. test/unit/embed/__init__.py +0 -0
  162. test/unit/embed/test_mixedbreadai.py +0 -42
  163. test/unit/embed/test_octoai.py +0 -27
  164. test/unit/embed/test_openai.py +0 -28
  165. test/unit/embed/test_vertexai.py +0 -25
  166. test/unit/embed/test_voyageai.py +0 -24
  167. test/unit/embedders/__init__.py +0 -0
  168. test/unit/embedders/test_bedrock.py +0 -36
  169. test/unit/embedders/test_huggingface.py +0 -48
  170. test/unit/embedders/test_mixedbread.py +0 -37
  171. test/unit/embedders/test_octoai.py +0 -35
  172. test/unit/embedders/test_openai.py +0 -35
  173. test/unit/embedders/test_togetherai.py +0 -37
  174. test/unit/embedders/test_vertexai.py +0 -37
  175. test/unit/embedders/test_voyageai.py +0 -38
  176. test/unit/partitioners/__init__.py +0 -0
  177. test/unit/partitioners/test_partitioner.py +0 -63
  178. test/unit/test_error.py +0 -27
  179. test/unit/test_html.py +0 -112
  180. test/unit/test_interfaces.py +0 -26
  181. test/unit/test_utils.py +0 -220
  182. test/unit/utils/__init__.py +0 -0
  183. test/unit/utils/data_generator.py +0 -32
  184. unstructured_ingest-0.7.2.dist-info/METADATA +0 -383
  185. unstructured_ingest-0.7.2.dist-info/top_level.txt +0 -3
  186. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info}/entry_points.txt +0 -0
  187. {unstructured_ingest-0.7.2.dist-info → unstructured_ingest-1.0.2.dist-info/licenses}/LICENSE.md +0 -0
test/integration/embedders/test_vertexai.py DELETED
@@ -1,63 +0,0 @@
1
- import json
2
- import os
3
- from pathlib import Path
4
-
5
- import pytest
6
-
7
- from test.integration.embedders.utils import (
8
- validate_embedding_output,
9
- validate_raw_embedder,
10
- validate_raw_embedder_async,
11
- )
12
- from test.integration.utils import requires_env
13
- from unstructured_ingest.embed.vertexai import (
14
- AsyncVertexAIEmbeddingEncoder,
15
- VertexAIEmbeddingConfig,
16
- VertexAIEmbeddingEncoder,
17
- )
18
- from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
19
-
20
- API_KEY = "VERTEXAI_API_KEY"
21
-
22
-
23
- def get_api_key() -> str:
24
- api_key = os.getenv(API_KEY, None)
25
- assert api_key
26
- return api_key
27
-
28
-
29
- @requires_env(API_KEY)
30
- def test_vertexai_embedder(embedder_file: Path):
31
- api_key = get_api_key()
32
- embedder_config = EmbedderConfig(embedding_provider="vertexai", embedding_api_key=api_key)
33
- embedder = Embedder(config=embedder_config)
34
- results = embedder.run(elements_filepath=embedder_file)
35
- assert results
36
- with embedder_file.open("r") as f:
37
- original_elements = json.load(f)
38
- validate_embedding_output(original_elements=original_elements, output_elements=results)
39
-
40
-
41
- @requires_env(API_KEY)
42
- def test_raw_vertexai_embedder(embedder_file: Path):
43
- api_key = get_api_key()
44
- embedder = VertexAIEmbeddingEncoder(
45
- config=VertexAIEmbeddingConfig(
46
- api_key=api_key,
47
- )
48
- )
49
- validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=768)
50
-
51
-
52
- @requires_env(API_KEY)
53
- @pytest.mark.asyncio
54
- async def test_raw_async_vertexai_embedder(embedder_file: Path):
55
- api_key = get_api_key()
56
- embedder = AsyncVertexAIEmbeddingEncoder(
57
- config=VertexAIEmbeddingConfig(
58
- api_key=api_key,
59
- )
60
- )
61
- await validate_raw_embedder_async(
62
- embedder=embedder, embedder_file=embedder_file, expected_dimension=768
63
- )
test/integration/embedders/test_voyageai.py DELETED
@@ -1,79 +0,0 @@
1
- import json
2
- import os
3
- from pathlib import Path
4
-
5
- import pytest
6
-
7
- from test.integration.embedders.utils import (
8
- validate_embedding_output,
9
- validate_raw_embedder,
10
- validate_raw_embedder_async,
11
- )
12
- from test.integration.utils import requires_env
13
- from unstructured_ingest.embed.voyageai import (
14
- AsyncVoyageAIEmbeddingEncoder,
15
- VoyageAIEmbeddingConfig,
16
- VoyageAIEmbeddingEncoder,
17
- )
18
- from unstructured_ingest.processes.embedder import Embedder, EmbedderConfig
19
-
20
- API_KEY = "VOYAGEAI_API_KEY"
21
-
22
-
23
- def get_api_key() -> str:
24
- api_key = os.getenv(API_KEY, None)
25
- assert api_key
26
- return api_key
27
-
28
-
29
- @requires_env(API_KEY)
30
- def test_voyageai_embedder(embedder_file: Path):
31
- api_key = get_api_key()
32
- embedder_config = EmbedderConfig(embedding_provider="voyageai", embedding_api_key=api_key)
33
- embedder = Embedder(config=embedder_config)
34
- results = embedder.run(elements_filepath=embedder_file)
35
- assert results
36
- with embedder_file.open("r") as f:
37
- original_elements = json.load(f)
38
- validate_embedding_output(original_elements=original_elements, output_elements=results)
39
-
40
-
41
- @requires_env(API_KEY)
42
- def test_raw_voyageai_embedder(embedder_file: Path):
43
- api_key = get_api_key()
44
- embedder = VoyageAIEmbeddingEncoder(
45
- config=VoyageAIEmbeddingConfig(
46
- api_key=api_key,
47
- )
48
- )
49
- validate_raw_embedder(embedder=embedder, embedder_file=embedder_file, expected_dimension=1024)
50
-
51
-
52
- @requires_env(API_KEY)
53
- @pytest.mark.asyncio
54
- async def test_raw_async_voyageai_embedder(embedder_file: Path):
55
- api_key = get_api_key()
56
- embedder = AsyncVoyageAIEmbeddingEncoder(
57
- config=VoyageAIEmbeddingConfig(
58
- api_key=api_key,
59
- )
60
- )
61
- await validate_raw_embedder_async(
62
- embedder=embedder, embedder_file=embedder_file, expected_dimension=1024
63
- )
64
-
65
-
66
- @requires_env(API_KEY)
67
- def test_voyageai_multimodal_embedder(embedder_file: Path):
68
- api_key = get_api_key()
69
- embedder_config = EmbedderConfig(
70
- embedding_provider="voyageai",
71
- embedding_api_key=api_key,
72
- embedding_model_name="voyage-multimodal-3",
73
- )
74
- embedder = Embedder(config=embedder_config)
75
- results = embedder.run(elements_filepath=embedder_file)
76
- assert results
77
- with embedder_file.open("r") as f:
78
- original_elements = json.load(f)
79
- validate_embedding_output(original_elements=original_elements, output_elements=results)
test/integration/embedders/utils.py DELETED
@@ -1,66 +0,0 @@
1
- import json
2
- from pathlib import Path
3
- from typing import Optional
4
-
5
- from unstructured_ingest.embed.interfaces import AsyncBaseEmbeddingEncoder, BaseEmbeddingEncoder
6
-
7
-
8
- def validate_embedding_output(original_elements: list[dict], output_elements: list[dict]):
9
- """
10
- Make sure the following characteristics are met:
11
- * The same number of elements are returned
12
- * For each element that had text, an embeddings entry was added in the output
13
- * Other than the embedding, nothing about the element was changed
14
- """
15
- assert len(original_elements) == len(output_elements)
16
- for original_element, output_element in zip(original_elements, output_elements):
17
- if original_element.get("text"):
18
- assert output_element.get("embeddings", None)
19
- output_element.pop("embeddings", None)
20
- assert original_element == output_element
21
-
22
-
23
- def validate_raw_embedder(
24
- embedder: BaseEmbeddingEncoder,
25
- embedder_file: Path,
26
- expected_dimension: Optional[int] = None,
27
- expected_is_unit_vector: bool = True,
28
- ):
29
- with open(embedder_file) as f:
30
- elements = json.load(f)
31
- all_text = [element["text"] for element in elements]
32
- single_text = all_text[0]
33
- dimension = embedder.dimension
34
- if expected_dimension:
35
- assert (
36
- dimension == expected_dimension
37
- ), f"dimensions {dimension} didn't match expected: {expected_dimension}"
38
- is_unit_vector = embedder.is_unit_vector
39
- assert is_unit_vector == expected_is_unit_vector
40
- single_embedding = embedder.embed_query(query=single_text)
41
- assert len(single_embedding) == dimension
42
- embedded_elements = embedder.embed_documents(elements=elements)
43
- validate_embedding_output(original_elements=elements, output_elements=embedded_elements)
44
-
45
-
46
- async def validate_raw_embedder_async(
47
- embedder: AsyncBaseEmbeddingEncoder,
48
- embedder_file: Path,
49
- expected_dimension: Optional[int] = None,
50
- expected_is_unit_vector: bool = True,
51
- ):
52
- with open(embedder_file) as f:
53
- elements = json.load(f)
54
- all_text = [element["text"] for element in elements]
55
- single_text = all_text[0]
56
- dimension = await embedder.dimension
57
- if expected_dimension:
58
- assert (
59
- dimension == expected_dimension
60
- ), f"dimension {dimension} didn't match expected: {expected_dimension}"
61
- is_unit_vector = await embedder.is_unit_vector
62
- assert is_unit_vector == expected_is_unit_vector
63
- single_embedding = await embedder.embed_query(query=single_text)
64
- assert len(single_embedding) == dimension
65
- embedded_elements = await embedder.embed_documents(elements=elements)
66
- validate_embedding_output(original_elements=elements, output_elements=embedded_elements)
File without changes
test/integration/partitioners/test_partitioner.py DELETED
@@ -1,76 +0,0 @@
1
- import os
2
- from pathlib import Path
3
-
4
- import pytest
5
-
6
- from test.integration.utils import requires_env
7
- from unstructured_ingest.errors_v2 import UserError
8
- from unstructured_ingest.processes.partitioner import Partitioner, PartitionerConfig
9
-
10
- int_test_dir = Path(__file__).parent
11
- assets_dir = int_test_dir / "assets"
12
-
13
- all_partition_files = [path for path in assets_dir.iterdir() if path.is_file()]
14
- non_image_partition_files = [
15
- path for path in all_partition_files if path.suffix not in [".jpg", ".png", ".tif"]
16
- ]
17
- supported_fast_partition_files = [
18
- path for path in non_image_partition_files if path.suffix != ".eml"
19
- ]
20
- image_partition_files = [
21
- path for path in all_partition_files if path not in non_image_partition_files
22
- ]
23
-
24
-
25
- @pytest.mark.parametrize(
26
- "partition_file", all_partition_files, ids=[path.name for path in all_partition_files]
27
- )
28
- @requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
29
- @pytest.mark.asyncio
30
- async def test_partitioner_api_hi_res(partition_file: Path):
31
- api_key = os.getenv("UNSTRUCTURED_API_KEY")
32
- api_url = os.getenv("UNSTRUCTURED_API_URL")
33
- partitioner_config = PartitionerConfig(
34
- strategy="hi_res", partition_by_api=True, api_key=api_key, partition_endpoint=api_url
35
- )
36
- partitioner = Partitioner(config=partitioner_config)
37
- results = await partitioner.run_async(filename=partition_file)
38
- assert results
39
-
40
-
41
- @pytest.mark.parametrize(
42
- "partition_file",
43
- supported_fast_partition_files,
44
- ids=[path.name for path in supported_fast_partition_files],
45
- )
46
- @requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
47
- @pytest.mark.asyncio
48
- async def test_partitioner_api_fast(partition_file: Path):
49
- api_key = os.getenv("UNSTRUCTURED_API_KEY")
50
- api_url = os.getenv("UNSTRUCTURED_API_URL")
51
- partitioner_config = PartitionerConfig(
52
- strategy="fast", partition_by_api=True, api_key=api_key, partition_endpoint=api_url
53
- )
54
- partitioner = Partitioner(config=partitioner_config)
55
- results = await partitioner.run_async(filename=partition_file)
56
- assert results
57
-
58
-
59
- @pytest.mark.parametrize(
60
- "partition_file", image_partition_files, ids=[path.name for path in image_partition_files]
61
- )
62
- @requires_env("UNSTRUCTURED_API_KEY", "UNSTRUCTURED_API_URL")
63
- @pytest.mark.asyncio
64
- async def test_partitioner_api_fast_error(partition_file: Path):
65
- api_key = os.getenv("UNSTRUCTURED_API_KEY")
66
- api_url = os.getenv("UNSTRUCTURED_API_URL")
67
- partitioner_config = PartitionerConfig(
68
- strategy="fast",
69
- partition_by_api=True,
70
- api_key=api_key,
71
- partition_endpoint=api_url,
72
- raise_unsupported_filetype=True,
73
- )
74
- partitioner = Partitioner(config=partitioner_config)
75
- with pytest.raises(UserError):
76
- await partitioner.run_async(filename=partition_file)
test/integration/utils.py DELETED
@@ -1,15 +0,0 @@
1
- import os
2
-
3
- import pytest
4
-
5
-
6
- def requires_env(*envs):
7
- if len(envs) == 1:
8
- env = envs[0]
9
- return pytest.mark.skipif(
10
- env not in os.environ, reason=f"Environment variable not set: {env}"
11
- )
12
- return pytest.mark.skipif(
13
- not all(env in os.environ for env in envs),
14
- reason="All required environment variables not set: {}".format(", ".join(envs)),
15
- )
test/unit/__init__.py DELETED
File without changes
File without changes
test/unit/chunkers/test_chunkers.py DELETED
@@ -1,49 +0,0 @@
1
- import random
2
-
3
- import faker
4
- import pytest
5
-
6
- from unstructured_ingest.processes.chunker import Chunker, ChunkerConfig
7
-
8
- fake = faker.Faker()
9
-
10
-
11
- def generate_chunker_config_params() -> dict:
12
- params = {}
13
- random_val = random.random()
14
- if random_val < 0.5:
15
- params["chunking_strategy"] = fake.word() if random.random() < 0.5 else None
16
- params["chunk_combine_text_under_n_chars"] = (
17
- fake.random_int() if random.random() < 0.5 else None
18
- )
19
- params["chunk_include_orig_elements"] = fake.boolean() if random.random() < 0.5 else None
20
- params["chunk_max_characters"] = fake.random_int()
21
- params["chunk_multipage_sections"] = fake.boolean()
22
- params["chunk_new_after_n_chars"] = fake.random_int() if random.random() < 0.5 else None
23
- params["chunk_overlap"] = fake.random_int() if random.random() < 0.5 else None
24
- params["chunk_overlap_all"] = fake.boolean() if random.random() < 0.5 else None
25
- if random_val < 0.5:
26
- params["chunk_by_api"] = True
27
- params["chunking_endpoint"] = fake.url()
28
- params["chunk_api_key"] = fake.password()
29
- else:
30
- params["chunk_by_api"] = False
31
-
32
- return params
33
-
34
-
35
- @pytest.mark.parametrize(
36
- "partition_config_params", [generate_chunker_config_params() for i in range(10)]
37
- )
38
- def test_chunker_config(partition_config_params: dict):
39
- chunker_config = ChunkerConfig.model_validate(partition_config_params)
40
- assert chunker_config
41
-
42
-
43
- @pytest.mark.parametrize(
44
- "partition_config_params", [generate_chunker_config_params() for i in range(10)]
45
- )
46
- def test_chunker(partition_config_params: dict):
47
- chunker_config = ChunkerConfig.model_validate(partition_config_params)
48
- chunker = Chunker(config=chunker_config)
49
- assert chunker
File without changes
File without changes