unstructured-ingest 0.2.2__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic; see the release advisory for more details.

Files changed (93)
  1. test/integration/connectors/elasticsearch/__init__.py +0 -0
  2. test/integration/connectors/elasticsearch/conftest.py +34 -0
  3. test/integration/connectors/elasticsearch/test_elasticsearch.py +308 -0
  4. test/integration/connectors/elasticsearch/test_opensearch.py +302 -0
  5. test/integration/connectors/sql/test_postgres.py +10 -4
  6. test/integration/connectors/sql/test_singlestore.py +8 -4
  7. test/integration/connectors/sql/test_snowflake.py +10 -6
  8. test/integration/connectors/sql/test_sqlite.py +4 -4
  9. test/integration/connectors/test_astradb.py +156 -0
  10. test/integration/connectors/test_azure_cog_search.py +233 -0
  11. test/integration/connectors/test_delta_table.py +46 -0
  12. test/integration/connectors/test_kafka.py +150 -16
  13. test/integration/connectors/test_lancedb.py +209 -0
  14. test/integration/connectors/test_milvus.py +141 -0
  15. test/integration/connectors/test_pinecone.py +213 -0
  16. test/integration/connectors/test_s3.py +23 -0
  17. test/integration/connectors/utils/docker.py +81 -15
  18. test/integration/connectors/utils/validation.py +10 -0
  19. test/integration/connectors/weaviate/__init__.py +0 -0
  20. test/integration/connectors/weaviate/conftest.py +15 -0
  21. test/integration/connectors/weaviate/test_local.py +131 -0
  22. test/unit/v2/__init__.py +0 -0
  23. test/unit/v2/chunkers/__init__.py +0 -0
  24. test/unit/v2/chunkers/test_chunkers.py +49 -0
  25. test/unit/v2/connectors/__init__.py +0 -0
  26. test/unit/v2/embedders/__init__.py +0 -0
  27. test/unit/v2/embedders/test_bedrock.py +36 -0
  28. test/unit/v2/embedders/test_huggingface.py +48 -0
  29. test/unit/v2/embedders/test_mixedbread.py +37 -0
  30. test/unit/v2/embedders/test_octoai.py +35 -0
  31. test/unit/v2/embedders/test_openai.py +35 -0
  32. test/unit/v2/embedders/test_togetherai.py +37 -0
  33. test/unit/v2/embedders/test_vertexai.py +37 -0
  34. test/unit/v2/embedders/test_voyageai.py +38 -0
  35. test/unit/v2/partitioners/__init__.py +0 -0
  36. test/unit/v2/partitioners/test_partitioner.py +63 -0
  37. test/unit/v2/utils/__init__.py +0 -0
  38. test/unit/v2/utils/data_generator.py +32 -0
  39. unstructured_ingest/__version__.py +1 -1
  40. unstructured_ingest/cli/cmds/__init__.py +2 -2
  41. unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  42. unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  43. unstructured_ingest/pipeline/reformat/embedding.py +1 -1
  44. unstructured_ingest/runner/writers/__init__.py +2 -2
  45. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  46. unstructured_ingest/utils/data_prep.py +9 -1
  47. unstructured_ingest/v2/constants.py +2 -0
  48. unstructured_ingest/v2/processes/connectors/__init__.py +7 -20
  49. unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
  50. unstructured_ingest/v2/processes/connectors/astradb.py +35 -23
  51. unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +116 -35
  52. unstructured_ingest/v2/processes/connectors/confluence.py +2 -2
  53. unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
  54. unstructured_ingest/v2/processes/connectors/delta_table.py +37 -9
  55. unstructured_ingest/v2/processes/connectors/elasticsearch/__init__.py +19 -0
  56. unstructured_ingest/v2/processes/connectors/{elasticsearch.py → elasticsearch/elasticsearch.py} +93 -46
  57. unstructured_ingest/v2/processes/connectors/{opensearch.py → elasticsearch/opensearch.py} +1 -1
  58. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +27 -0
  59. unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
  60. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +6 -2
  61. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +38 -2
  62. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +84 -23
  63. unstructured_ingest/v2/processes/connectors/kafka/local.py +32 -4
  64. unstructured_ingest/v2/processes/connectors/lancedb/__init__.py +17 -0
  65. unstructured_ingest/v2/processes/connectors/lancedb/aws.py +43 -0
  66. unstructured_ingest/v2/processes/connectors/lancedb/azure.py +43 -0
  67. unstructured_ingest/v2/processes/connectors/lancedb/gcp.py +44 -0
  68. unstructured_ingest/v2/processes/connectors/lancedb/lancedb.py +161 -0
  69. unstructured_ingest/v2/processes/connectors/lancedb/local.py +44 -0
  70. unstructured_ingest/v2/processes/connectors/milvus.py +72 -27
  71. unstructured_ingest/v2/processes/connectors/onedrive.py +2 -3
  72. unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
  73. unstructured_ingest/v2/processes/connectors/pinecone.py +101 -13
  74. unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
  75. unstructured_ingest/v2/processes/connectors/slack.py +2 -2
  76. unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
  77. unstructured_ingest/v2/processes/connectors/sql/sql.py +97 -26
  78. unstructured_ingest/v2/processes/connectors/weaviate/__init__.py +22 -0
  79. unstructured_ingest/v2/processes/connectors/weaviate/cloud.py +164 -0
  80. unstructured_ingest/v2/processes/connectors/weaviate/embedded.py +90 -0
  81. unstructured_ingest/v2/processes/connectors/weaviate/local.py +73 -0
  82. unstructured_ingest/v2/processes/connectors/weaviate/weaviate.py +289 -0
  83. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/METADATA +20 -19
  84. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/RECORD +91 -50
  85. unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
  86. unstructured_ingest/v2/processes/connectors/weaviate.py +0 -242
  87. /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
  88. /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
  89. /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
  90. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/LICENSE.md +0 -0
  91. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/WHEEL +0 -0
  92. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/entry_points.txt +0 -0
  93. {unstructured_ingest-0.2.2.dist-info → unstructured_ingest-0.3.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,48 @@
1
+ import random
2
+ from typing import Any
3
+
4
+ import faker
5
+ import pytest
6
+
7
+ from test.unit.v2.utils.data_generator import generate_random_dictionary
8
+ from unstructured_ingest.embed.huggingface import (
9
+ HuggingFaceEmbeddingConfig,
10
+ HuggingFaceEmbeddingEncoder,
11
+ )
12
+
13
+ fake = faker.Faker()
14
+
15
+
16
+ def generate_embedder_config_params() -> dict:
17
+ params = {}
18
+ if random.random() < 0.5:
19
+ params["embed_model_name"] = fake.word() if random.random() < 0.5 else None
20
+ params["embedder_model_kwargs"] = (
21
+ generate_random_dictionary(key_type=str, value_type=Any)
22
+ if random.random() < 0.5
23
+ else None
24
+ )
25
+ params["encode_kwargs"] = (
26
+ generate_random_dictionary(key_type=str, value_type=Any)
27
+ if random.random() < 0.5
28
+ else None
29
+ )
30
+ params["cache_folder"] = fake.file_path() if random.random() < 0.5 else None
31
+ return params
32
+
33
+
34
+ @pytest.mark.parametrize(
35
+ "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
36
+ )
37
+ def test_embedder_config(embedder_config_params: dict):
38
+ embedder_config = HuggingFaceEmbeddingConfig.model_validate(embedder_config_params)
39
+ assert embedder_config
40
+
41
+
42
+ @pytest.mark.parametrize(
43
+ "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
44
+ )
45
+ def test_embedder(embedder_config_params: dict):
46
+ embedder_config = HuggingFaceEmbeddingConfig.model_validate(embedder_config_params)
47
+ embedder = HuggingFaceEmbeddingEncoder(config=embedder_config)
48
+ assert embedder
@@ -0,0 +1,37 @@
1
+ import random
2
+
3
+ import faker
4
+ import pytest
5
+
6
+ from unstructured_ingest.embed.mixedbreadai import (
7
+ MixedbreadAIEmbeddingConfig,
8
+ MixedbreadAIEmbeddingEncoder,
9
+ )
10
+
11
+ fake = faker.Faker()
12
+
13
+
14
+ def generate_embedder_config_params() -> dict:
15
+ params = {
16
+ "api_key": fake.password(),
17
+ }
18
+ if random.random() < 0.5:
19
+ params["embedder_model_name"] = fake.word()
20
+ return params
21
+
22
+
23
+ @pytest.mark.parametrize(
24
+ "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
25
+ )
26
+ def test_embedder_config(embedder_config_params: dict):
27
+ embedder_config = MixedbreadAIEmbeddingConfig.model_validate(embedder_config_params)
28
+ assert embedder_config
29
+
30
+
31
+ @pytest.mark.parametrize(
32
+ "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
33
+ )
34
+ def test_embedder(embedder_config_params: dict):
35
+ embedder_config = MixedbreadAIEmbeddingConfig.model_validate(embedder_config_params)
36
+ embedder = MixedbreadAIEmbeddingEncoder(config=embedder_config)
37
+ assert embedder
@@ -0,0 +1,35 @@
1
+ import random
2
+
3
+ import faker
4
+ import pytest
5
+
6
+ from unstructured_ingest.embed.octoai import OctoAiEmbeddingConfig, OctoAIEmbeddingEncoder
7
+
8
+ fake = faker.Faker()
9
+
10
+
11
+ def generate_embedder_config_params() -> dict:
12
+ params = {
13
+ "api_key": fake.password(),
14
+ }
15
+ if random.random() < 0.5:
16
+ params["embedder_model_name"] = fake.word()
17
+ params["base_url"] = fake.url()
18
+ return params
19
+
20
+
21
+ @pytest.mark.parametrize(
22
+ "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
23
+ )
24
+ def test_embedder_config(embedder_config_params: dict):
25
+ embedder_config = OctoAiEmbeddingConfig.model_validate(embedder_config_params)
26
+ assert embedder_config
27
+
28
+
29
+ @pytest.mark.parametrize(
30
+ "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
31
+ )
32
+ def test_embedder(embedder_config_params: dict):
33
+ embedder_config = OctoAiEmbeddingConfig.model_validate(embedder_config_params)
34
+ embedder = OctoAIEmbeddingEncoder(config=embedder_config)
35
+ assert embedder
@@ -0,0 +1,35 @@
1
+ import random
2
+
3
+ import faker
4
+ import pytest
5
+
6
+ from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder
7
+
8
+ fake = faker.Faker()
9
+
10
+
11
+ def generate_embedder_config_params() -> dict:
12
+ params = {
13
+ "api_key": fake.password(),
14
+ }
15
+ if random.random() < 0.5:
16
+ params["embedder_model_name"] = fake.word()
17
+ params["base_url"] = fake.url()
18
+ return params
19
+
20
+
21
+ @pytest.mark.parametrize(
22
+ "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
23
+ )
24
+ def test_embedder_config(embedder_config_params: dict):
25
+ embedder_config = OpenAIEmbeddingConfig.model_validate(embedder_config_params)
26
+ assert embedder_config
27
+
28
+
29
+ @pytest.mark.parametrize(
30
+ "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
31
+ )
32
+ def test_embedder(embedder_config_params: dict):
33
+ embedder_config = OpenAIEmbeddingConfig.model_validate(embedder_config_params)
34
+ embedder = OpenAIEmbeddingEncoder(config=embedder_config)
35
+ assert embedder
@@ -0,0 +1,37 @@
1
+ import random
2
+
3
+ import faker
4
+ import pytest
5
+
6
+ from unstructured_ingest.embed.togetherai import (
7
+ TogetherAIEmbeddingConfig,
8
+ TogetherAIEmbeddingEncoder,
9
+ )
10
+
11
+ fake = faker.Faker()
12
+
13
+
14
+ def generate_embedder_config_params() -> dict:
15
+ params = {
16
+ "api_key": fake.password(),
17
+ }
18
+ if random.random() < 0.5:
19
+ params["embedder_model_name"] = fake.word()
20
+ return params
21
+
22
+
23
+ @pytest.mark.parametrize(
24
+ "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
25
+ )
26
+ def test_embedder_config(embedder_config_params: dict):
27
+ embedder_config = TogetherAIEmbeddingConfig.model_validate(embedder_config_params)
28
+ assert embedder_config
29
+
30
+
31
+ @pytest.mark.parametrize(
32
+ "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
33
+ )
34
+ def test_embedder(embedder_config_params: dict):
35
+ embedder_config = TogetherAIEmbeddingConfig.model_validate(embedder_config_params)
36
+ embedder = TogetherAIEmbeddingEncoder(config=embedder_config)
37
+ assert embedder
@@ -0,0 +1,37 @@
1
+ import json
2
+ import random
3
+ from typing import Any
4
+
5
+ import faker
6
+ import pytest
7
+
8
+ from test.unit.v2.utils.data_generator import generate_random_dictionary
9
+ from unstructured_ingest.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder
10
+
11
+ fake = faker.Faker()
12
+
13
+
14
+ def generate_embedder_config_params() -> dict:
15
+ params = {
16
+ "api_key": json.dumps(generate_random_dictionary(key_type=str, value_type=Any)),
17
+ }
18
+ if random.random() < 0.5:
19
+ params["embedder_model_name"] = fake.word()
20
+ return params
21
+
22
+
23
+ @pytest.mark.parametrize(
24
+ "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
25
+ )
26
+ def test_embedder_config(embedder_config_params: dict):
27
+ embedder_config = VertexAIEmbeddingConfig.model_validate(embedder_config_params)
28
+ assert embedder_config
29
+
30
+
31
+ @pytest.mark.parametrize(
32
+ "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
33
+ )
34
+ def test_embedder(embedder_config_params: dict):
35
+ embedder_config = VertexAIEmbeddingConfig.model_validate(embedder_config_params)
36
+ embedder = VertexAIEmbeddingEncoder(config=embedder_config)
37
+ assert embedder
@@ -0,0 +1,38 @@
1
+ import random
2
+
3
+ import faker
4
+ import pytest
5
+
6
+ from unstructured_ingest.embed.voyageai import VoyageAIEmbeddingConfig, VoyageAIEmbeddingEncoder
7
+
8
+ fake = faker.Faker()
9
+
10
+
11
+ def generate_embedder_config_params() -> dict:
12
+ params = {
13
+ "api_key": fake.password(),
14
+ }
15
+ if random.random() < 0.5:
16
+ params["embedder_model_name"] = fake.word()
17
+ params["batch_size"] = fake.random_int()
18
+ params["truncation"] = fake.boolean()
19
+ params["max_retries"] = fake.random_int()
20
+ params["timeout_in_seconds"] = fake.random_int()
21
+ return params
22
+
23
+
24
+ @pytest.mark.parametrize(
25
+ "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
26
+ )
27
+ def test_embedder_config(embedder_config_params: dict):
28
+ embedder_config = VoyageAIEmbeddingConfig.model_validate(embedder_config_params)
29
+ assert embedder_config
30
+
31
+
32
+ @pytest.mark.parametrize(
33
+ "embedder_config_params", [generate_embedder_config_params() for i in range(10)]
34
+ )
35
+ def test_embedder(embedder_config_params: dict):
36
+ embedder_config = VoyageAIEmbeddingConfig.model_validate(embedder_config_params)
37
+ embedder = VoyageAIEmbeddingEncoder(config=embedder_config)
38
+ assert embedder
File without changes
@@ -0,0 +1,63 @@
1
+ import random
2
+ from typing import Any
3
+
4
+ import faker
5
+ import pytest
6
+
7
+ from test.unit.v2.utils.data_generator import generate_random_dictionary
8
+ from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig
9
+
10
+ fake = faker.Faker()
11
+
12
+
13
+ def generate_partitioner_config_params() -> dict:
14
+ params = {
15
+ "strategy": random.choice(["fast", "hi_res", "auto"]),
16
+ "ocr_languages": fake.words() if random.random() < 0.5 else None,
17
+ "encoding": fake.word() if random.random() < 0.5 else None,
18
+ "additional_partition_args": (
19
+ generate_random_dictionary(key_type=str, value_type=Any)
20
+ if random.random() < 0.5
21
+ else None
22
+ ),
23
+ "skip_infer_table_types": fake.words() if random.random() < 0.5 else None,
24
+ "flatten_metadata": fake.boolean(),
25
+ "hi_res_model_name": fake.word() if random.random() < 0.5 else None,
26
+ }
27
+ random_val = random.random()
28
+ # Randomly set the fields_include to a random list[str]
29
+ if random_val < 0.5:
30
+ params["fields_include"] = fake.words()
31
+
32
+ # Randomly set the metadata_exclude or metadata_include to a valid
33
+ # list[str] or don't set it at all
34
+ if random.random() < (1 / 3):
35
+ params["metadata_exclude"] = fake.words()
36
+ elif random_val < (2 / 3):
37
+ params["metadata_include"] = fake.words()
38
+
39
+ # Randomly set the values associated with calling the api, or not at all
40
+ if random.random() < 0.5:
41
+ params["partition_by_api"]: True
42
+ params["partition_endpoint"] = fake.url()
43
+ params["api_key"] = fake.password()
44
+ else:
45
+ params["partition_by_api"]: False
46
+ return params
47
+
48
+
49
+ @pytest.mark.parametrize(
50
+ "partition_config_params", [generate_partitioner_config_params() for i in range(10)]
51
+ )
52
+ def test_partition_config(partition_config_params: dict):
53
+ partition_config = PartitionerConfig.model_validate(partition_config_params)
54
+ assert partition_config
55
+
56
+
57
+ @pytest.mark.parametrize(
58
+ "partition_config_params", [generate_partitioner_config_params() for i in range(10)]
59
+ )
60
+ def test_partitioner(partition_config_params: dict):
61
+ partition_config = PartitionerConfig.model_validate(partition_config_params)
62
+ partitioner = Partitioner(config=partition_config)
63
+ assert partitioner
File without changes
@@ -0,0 +1,32 @@
1
+ import random
2
+ from typing import Any, Type
3
+
4
+ from faker import Faker
5
+
6
+ fake = Faker()
7
+
8
+ type_to_random_value_map = {
9
+ str: fake.sentence,
10
+ int: fake.random_int,
11
+ float: fake.random_digit,
12
+ bool: fake.boolean,
13
+ }
14
+ type_to_random_value_map_key = type_to_random_value_map.copy()
15
+ type_to_random_value_map_key[str] = fake.word
16
+
17
+
18
+ def generate_random_dictionary(key_type: Type = str, value_type: Type = str) -> dict:
19
+ d = {}
20
+ num_keys = random.randint(1, 3)
21
+ for i in range(num_keys):
22
+ key = type_to_random_value_map_key[key_type]()
23
+ current_value_type = value_type
24
+ if current_value_type == Any:
25
+ current_value_type = random.choice(list(type_to_random_value_map.keys()) + [dict])
26
+ value = (
27
+ generate_random_dictionary(key_type=key_type, value_type=value_type)
28
+ if current_value_type is dict
29
+ else type_to_random_value_map[current_value_type]()
30
+ )
31
+ d[key] = value
32
+ return d
@@ -1 +1 @@
1
- __version__ = "0.2.2" # pragma: no cover
1
+ __version__ = "0.3.1" # pragma: no cover
@@ -9,7 +9,7 @@ from unstructured_ingest.cli.cmds.fsspec.sftp import get_base_src_cmd as sftp_ba
9
9
  from .airtable import get_base_src_cmd as airtable_base_src_cmd
10
10
  from .astradb import get_base_dest_cmd as astradb_base_dest_cmd
11
11
  from .astradb import get_base_src_cmd as astradb_base_src_cmd
12
- from .azure_cognitive_search import get_base_dest_cmd as azure_cognitive_search_base_dest_cmd
12
+ from .azure_ai_search import get_base_dest_cmd as azure_ai_search_base_dest_cmd
13
13
  from .biomed import get_base_src_cmd as biomed_base_src_cmd
14
14
  from .chroma import get_base_dest_cmd as chroma_base_dest_cmd
15
15
  from .clarifai import get_base_dest_cmd as clarifai_base_dest_cmd
@@ -118,7 +118,7 @@ base_dest_cmd_fns: t.List[t.Callable[[], "BaseDestCmd"]] = [
118
118
  gcs_base_dest_cmd,
119
119
  kafka_base_dest_cmd,
120
120
  s3_base_dest_cmd,
121
- azure_cognitive_search_base_dest_cmd,
121
+ azure_ai_search_base_dest_cmd,
122
122
  delta_table_dest_cmd,
123
123
  sql_base_dest_cmd,
124
124
  weaviate_dest_cmd,
@@ -6,14 +6,14 @@ import click
6
6
  from unstructured_ingest.cli.interfaces import (
7
7
  CliConfig,
8
8
  )
9
- from unstructured_ingest.connector.azure_cognitive_search import (
10
- AzureCognitiveSearchWriteConfig,
11
- SimpleAzureCognitiveSearchStorageConfig,
9
+ from unstructured_ingest.connector.azure_ai_search import (
10
+ AzureAISearchWriteConfig,
11
+ SimpleAzureAISearchStorageConfig,
12
12
  )
13
13
 
14
14
 
15
15
  @dataclass
16
- class AzureCognitiveSearchCliConfig(SimpleAzureCognitiveSearchStorageConfig, CliConfig):
16
+ class AzureAISearchCliConfig(SimpleAzureAISearchStorageConfig, CliConfig):
17
17
  @staticmethod
18
18
  def get_cli_options() -> t.List[click.Option]:
19
19
  options = [
@@ -39,7 +39,7 @@ class AzureCognitiveSearchCliConfig(SimpleAzureCognitiveSearchStorageConfig, Cli
39
39
 
40
40
 
41
41
  @dataclass
42
- class AzureCognitiveSearchCliWriteConfig(AzureCognitiveSearchWriteConfig, CliConfig):
42
+ class AzureAISearchCliWriteConfig(AzureAISearchWriteConfig, CliConfig):
43
43
  @staticmethod
44
44
  def get_cli_options() -> t.List[click.Option]:
45
45
  options = [
@@ -57,9 +57,9 @@ def get_base_dest_cmd():
57
57
  from unstructured_ingest.cli.base.dest import BaseDestCmd
58
58
 
59
59
  cmd_cls = BaseDestCmd(
60
- cmd_name="azure-cognitive-search",
61
- cli_config=AzureCognitiveSearchCliConfig,
62
- additional_cli_options=[AzureCognitiveSearchCliWriteConfig],
63
- write_config=AzureCognitiveSearchCliWriteConfig,
60
+ cmd_name="azure-ai-search",
61
+ cli_config=AzureAISearchCliConfig,
62
+ additional_cli_options=[AzureAISearchCliWriteConfig],
63
+ write_config=AzureAISearchCliWriteConfig,
64
64
  )
65
65
  return cmd_cls
@@ -19,28 +19,28 @@ if t.TYPE_CHECKING:
19
19
 
20
20
 
21
21
  @dataclass
22
- class AzureCognitiveSearchAccessConfig(AccessConfig):
22
+ class AzureAiSearchAccessConfig(AccessConfig):
23
23
  key: str = enhanced_field(sensitive=True)
24
24
 
25
25
 
26
26
  @dataclass
27
- class SimpleAzureCognitiveSearchStorageConfig(BaseConnectorConfig):
27
+ class SimpleAzureAISearchStorageConfig(BaseConnectorConfig):
28
28
  endpoint: str
29
- access_config: AzureCognitiveSearchAccessConfig
29
+ access_config: AzureAiSearchAccessConfig
30
30
 
31
31
 
32
32
  @dataclass
33
- class AzureCognitiveSearchWriteConfig(WriteConfig):
33
+ class AzureAISearchWriteConfig(WriteConfig):
34
34
  index: str
35
35
 
36
36
 
37
37
  @dataclass
38
- class AzureCognitiveSearchDestinationConnector(BaseDestinationConnector):
39
- write_config: AzureCognitiveSearchWriteConfig
40
- connector_config: SimpleAzureCognitiveSearchStorageConfig
38
+ class AzureAISearchDestinationConnector(BaseDestinationConnector):
39
+ write_config: AzureAISearchWriteConfig
40
+ connector_config: SimpleAzureAISearchStorageConfig
41
41
  _client: t.Optional["SearchClient"] = field(init=False, default=None)
42
42
 
43
- @requires_dependencies(["azure.search"], extras="azure-cognitive-search")
43
+ @requires_dependencies(["azure.search"], extras="azure-ai-search")
44
44
  def generate_client(self) -> "SearchClient":
45
45
  from azure.core.credentials import AzureKeyCredential
46
46
  from azure.search.documents import SearchClient
@@ -112,7 +112,7 @@ class AzureCognitiveSearchDestinationConnector(BaseDestinationConnector):
112
112
  if page_number := data.get("metadata", {}).get("page_number"):
113
113
  data["metadata"]["page_number"] = str(page_number)
114
114
 
115
- @requires_dependencies(["azure"], extras="azure-cognitive-search")
115
+ @requires_dependencies(["azure"], extras="azure-ai-search")
116
116
  def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]], **kwargs) -> None:
117
117
  import azure.core.exceptions
118
118
 
@@ -61,4 +61,4 @@ class Embedder(ReformatNode):
61
61
  return None
62
62
 
63
63
  def get_path(self) -> Path:
64
- return (Path(self.pipeline_context.work_dir) / "embedded").resolve()
64
+ return (Path(self.pipeline_context.work_dir) / "embedded.py").resolve()
@@ -1,7 +1,7 @@
1
1
  import typing as t
2
2
 
3
3
  from .astradb import AstraDBWriter
4
- from .azure_cognitive_search import AzureCognitiveSearchWriter
4
+ from .azure_ai_search import AzureAiSearchWriter
5
5
  from .base_writer import Writer
6
6
  from .chroma import ChromaWriter
7
7
  from .clarifai import ClarifaiWriter
@@ -25,7 +25,7 @@ from .weaviate import WeaviateWriter
25
25
  writer_map: t.Dict[str, t.Type[Writer]] = {
26
26
  "astradb": AstraDBWriter,
27
27
  "azure": AzureWriter,
28
- "azure_cognitive_search": AzureCognitiveSearchWriter,
28
+ "azure_ai_search": AzureAiSearchWriter,
29
29
  "box": BoxWriter,
30
30
  "chroma": ChromaWriter,
31
31
  "clarifai": ClarifaiWriter,
@@ -0,0 +1,24 @@
1
+ import typing as t
2
+ from dataclasses import dataclass
3
+
4
+ from unstructured_ingest.interfaces import BaseDestinationConnector
5
+ from unstructured_ingest.runner.writers.base_writer import Writer
6
+
7
+ if t.TYPE_CHECKING:
8
+ from unstructured_ingest.connector.azure_ai_search import (
9
+ AzureAISearchWriteConfig,
10
+ SimpleAzureAISearchStorageConfig,
11
+ )
12
+
13
+
14
+ @dataclass
15
+ class AzureAiSearchWriter(Writer):
16
+ connector_config: "SimpleAzureAISearchStorageConfig"
17
+ write_config: "AzureAISearchWriteConfig"
18
+
19
+ def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
20
+ from unstructured_ingest.connector.azure_ai_search import (
21
+ AzureAISearchDestinationConnector,
22
+ )
23
+
24
+ return AzureAISearchDestinationConnector
@@ -1,7 +1,9 @@
1
1
  import itertools
2
2
  import json
3
3
  from datetime import datetime
4
- from typing import Any, Iterable, Optional, Sequence, TypeVar, cast
4
+ from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
5
+
6
+ import pandas as pd
5
7
 
6
8
  DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
7
9
 
@@ -9,6 +11,12 @@ T = TypeVar("T")
9
11
  IterableT = Iterable[T]
10
12
 
11
13
 
14
+ def split_dataframe(df: pd.DataFrame, chunk_size: int = 100) -> Generator[pd.DataFrame, None, None]:
15
+ num_chunks = len(df) // chunk_size + 1
16
+ for i in range(num_chunks):
17
+ yield df[i * chunk_size : (i + 1) * chunk_size]
18
+
19
+
12
20
  def batch_generator(iterable: IterableT, batch_size: int = 100) -> IterableT:
13
21
  """A helper function to break an iterable into batches of size batch_size."""
14
22
  it = iter(iterable)
@@ -0,0 +1,2 @@
1
+ # Used to append to metadata for uploaders that store element-level data
2
+ RECORD_ID_LABEL = "record_id"
@@ -1,10 +1,13 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import unstructured_ingest.v2.processes.connectors.databricks # noqa: F401
4
+ import unstructured_ingest.v2.processes.connectors.elasticsearch # noqa: F401
4
5
  import unstructured_ingest.v2.processes.connectors.fsspec # noqa: F401
5
6
  import unstructured_ingest.v2.processes.connectors.kafka # noqa: F401
7
+ import unstructured_ingest.v2.processes.connectors.lancedb # noqa: F401
6
8
  import unstructured_ingest.v2.processes.connectors.qdrant # noqa: F401
7
9
  import unstructured_ingest.v2.processes.connectors.sql # noqa: F401
10
+ import unstructured_ingest.v2.processes.connectors.weaviate # noqa: F401
8
11
  from unstructured_ingest.v2.processes.connector_registry import (
9
12
  add_destination_entry,
10
13
  add_source_entry,
@@ -14,8 +17,8 @@ from .airtable import CONNECTOR_TYPE as AIRTABLE_CONNECTOR_TYPE
14
17
  from .airtable import airtable_source_entry
15
18
  from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
16
19
  from .astradb import astra_db_destination_entry, astra_db_source_entry
17
- from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE
18
- from .azure_cognitive_search import azure_cognitive_search_destination_entry
20
+ from .azure_ai_search import CONNECTOR_TYPE as AZURE_AI_SEARCH_CONNECTOR_TYPE
21
+ from .azure_ai_search import azure_ai_search_destination_entry
19
22
  from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
20
23
  from .chroma import chroma_destination_entry
21
24
  from .confluence import CONNECTOR_TYPE as CONFLUENCE_CONNECTOR_TYPE
@@ -24,8 +27,6 @@ from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
24
27
  from .couchbase import couchbase_destination_entry, couchbase_source_entry
25
28
  from .delta_table import CONNECTOR_TYPE as DELTA_TABLE_CONNECTOR_TYPE
26
29
  from .delta_table import delta_table_destination_entry
27
- from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
28
- from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
29
30
  from .gitlab import CONNECTOR_TYPE as GITLAB_CONNECTOR_TYPE
30
31
  from .gitlab import gitlab_source_entry
31
32
  from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
@@ -40,8 +41,6 @@ from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
40
41
  from .mongodb import mongodb_destination_entry, mongodb_source_entry
41
42
  from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
42
43
  from .onedrive import onedrive_destination_entry, onedrive_source_entry
43
- from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
44
- from .opensearch import opensearch_destination_entry, opensearch_source_entry
45
44
  from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
46
45
  from .outlook import outlook_source_entry
47
46
  from .pinecone import CONNECTOR_TYPE as PINECONE_CONNECTOR_TYPE
@@ -52,8 +51,6 @@ from .sharepoint import CONNECTOR_TYPE as SHAREPOINT_CONNECTOR_TYPE
52
51
  from .sharepoint import sharepoint_source_entry
53
52
  from .slack import CONNECTOR_TYPE as SLACK_CONNECTOR_TYPE
54
53
  from .slack import slack_source_entry
55
- from .weaviate import CONNECTOR_TYPE as WEAVIATE_CONNECTOR_TYPE
56
- from .weaviate import weaviate_destination_entry
57
54
 
58
55
  add_source_entry(source_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_source_entry)
59
56
  add_destination_entry(destination_type=ASTRA_DB_CONNECTOR_TYPE, entry=astra_db_destination_entry)
@@ -67,10 +64,6 @@ add_destination_entry(
67
64
  destination_type=DELTA_TABLE_CONNECTOR_TYPE, entry=delta_table_destination_entry
68
65
  )
69
66
 
70
- add_source_entry(source_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_source_entry)
71
- add_destination_entry(
72
- destination_type=ELASTICSEARCH_CONNECTOR_TYPE, entry=elasticsearch_destination_entry
73
- )
74
67
 
75
68
  add_source_entry(source_type=GOOGLE_DRIVE_CONNECTOR_TYPE, entry=google_drive_source_entry)
76
69
 
@@ -80,15 +73,9 @@ add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destina
80
73
  add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)
81
74
  add_destination_entry(destination_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_destination_entry)
82
75
 
83
- add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry)
84
- add_destination_entry(
85
- destination_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_destination_entry
86
- )
87
76
 
88
77
  add_source_entry(source_type=SALESFORCE_CONNECTOR_TYPE, entry=salesforce_source_entry)
89
78
 
90
- add_destination_entry(destination_type=WEAVIATE_CONNECTOR_TYPE, entry=weaviate_destination_entry)
91
-
92
79
  add_destination_entry(destination_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_destination_entry)
93
80
  add_source_entry(source_type=MONGODB_CONNECTOR_TYPE, entry=mongodb_source_entry)
94
81
 
@@ -97,8 +84,8 @@ add_source_entry(source_type=SHAREPOINT_CONNECTOR_TYPE, entry=sharepoint_source_
97
84
 
98
85
  add_destination_entry(destination_type=MILVUS_CONNECTOR_TYPE, entry=milvus_destination_entry)
99
86
  add_destination_entry(
100
- destination_type=AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE,
101
- entry=azure_cognitive_search_destination_entry,
87
+ destination_type=AZURE_AI_SEARCH_CONNECTOR_TYPE,
88
+ entry=azure_ai_search_destination_entry,
102
89
  )
103
90
 
104
91
  add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)