unstructured-ingest 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff shows the contents of publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between the two versions.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (78):
  1. test/integration/connectors/test_astradb.py +109 -0
  2. test/integration/connectors/test_azure_cog_search.py +233 -0
  3. test/integration/connectors/test_confluence.py +113 -0
  4. test/integration/connectors/test_kafka.py +167 -0
  5. test/integration/connectors/test_onedrive.py +112 -0
  6. test/integration/connectors/test_pinecone.py +161 -0
  7. test/integration/connectors/test_qdrant.py +137 -0
  8. test/integration/connectors/test_s3.py +23 -0
  9. test/integration/connectors/utils/docker.py +2 -1
  10. test/integration/connectors/utils/validation.py +73 -22
  11. test/unit/v2/__init__.py +0 -0
  12. test/unit/v2/chunkers/__init__.py +0 -0
  13. test/unit/v2/chunkers/test_chunkers.py +49 -0
  14. test/unit/v2/connectors/__init__.py +0 -0
  15. test/unit/v2/embedders/__init__.py +0 -0
  16. test/unit/v2/embedders/test_bedrock.py +36 -0
  17. test/unit/v2/embedders/test_huggingface.py +48 -0
  18. test/unit/v2/embedders/test_mixedbread.py +37 -0
  19. test/unit/v2/embedders/test_octoai.py +35 -0
  20. test/unit/v2/embedders/test_openai.py +35 -0
  21. test/unit/v2/embedders/test_togetherai.py +37 -0
  22. test/unit/v2/embedders/test_vertexai.py +37 -0
  23. test/unit/v2/embedders/test_voyageai.py +38 -0
  24. test/unit/v2/partitioners/__init__.py +0 -0
  25. test/unit/v2/partitioners/test_partitioner.py +63 -0
  26. test/unit/v2/utils/__init__.py +0 -0
  27. test/unit/v2/utils/data_generator.py +32 -0
  28. unstructured_ingest/__version__.py +1 -1
  29. unstructured_ingest/cli/cmds/__init__.py +2 -2
  30. unstructured_ingest/cli/cmds/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  31. unstructured_ingest/connector/{azure_cognitive_search.py → azure_ai_search.py} +9 -9
  32. unstructured_ingest/connector/kafka.py +0 -1
  33. unstructured_ingest/interfaces.py +7 -7
  34. unstructured_ingest/runner/writers/__init__.py +2 -2
  35. unstructured_ingest/runner/writers/azure_ai_search.py +24 -0
  36. unstructured_ingest/v2/constants.py +2 -0
  37. unstructured_ingest/v2/processes/chunker.py +2 -2
  38. unstructured_ingest/v2/processes/connectors/__init__.py +16 -5
  39. unstructured_ingest/v2/processes/connectors/airtable.py +2 -2
  40. unstructured_ingest/v2/processes/connectors/astradb.py +33 -21
  41. unstructured_ingest/v2/processes/connectors/{azure_cognitive_search.py → azure_ai_search.py} +112 -35
  42. unstructured_ingest/v2/processes/connectors/confluence.py +195 -0
  43. unstructured_ingest/v2/processes/connectors/couchbase.py +1 -0
  44. unstructured_ingest/v2/processes/connectors/databricks/volumes.py +2 -4
  45. unstructured_ingest/v2/processes/connectors/delta_table.py +17 -5
  46. unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -0
  47. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +28 -10
  48. unstructured_ingest/v2/processes/connectors/gitlab.py +267 -0
  49. unstructured_ingest/v2/processes/connectors/google_drive.py +3 -3
  50. unstructured_ingest/v2/processes/connectors/kafka/__init__.py +17 -0
  51. unstructured_ingest/v2/processes/connectors/kafka/cloud.py +118 -0
  52. unstructured_ingest/v2/processes/connectors/kafka/kafka.py +251 -0
  53. unstructured_ingest/v2/processes/connectors/kafka/local.py +103 -0
  54. unstructured_ingest/v2/processes/connectors/onedrive.py +165 -5
  55. unstructured_ingest/v2/processes/connectors/outlook.py +2 -2
  56. unstructured_ingest/v2/processes/connectors/pinecone.py +83 -12
  57. unstructured_ingest/v2/processes/connectors/qdrant/__init__.py +16 -0
  58. unstructured_ingest/v2/processes/connectors/qdrant/cloud.py +59 -0
  59. unstructured_ingest/v2/processes/connectors/qdrant/local.py +58 -0
  60. unstructured_ingest/v2/processes/connectors/qdrant/qdrant.py +168 -0
  61. unstructured_ingest/v2/processes/connectors/qdrant/server.py +60 -0
  62. unstructured_ingest/v2/processes/connectors/sharepoint.py +3 -2
  63. unstructured_ingest/v2/processes/connectors/slack.py +2 -2
  64. unstructured_ingest/v2/processes/connectors/sql/postgres.py +16 -8
  65. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +3 -1
  66. unstructured_ingest/v2/processes/connectors/sql/sql.py +2 -4
  67. unstructured_ingest/v2/processes/partitioner.py +14 -3
  68. unstructured_ingest/v2/unstructured_api.py +24 -10
  69. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/METADATA +17 -16
  70. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/RECORD +77 -41
  71. unstructured_ingest/runner/writers/azure_cognitive_search.py +0 -24
  72. /test/integration/embedders/{togetherai.py → test_togetherai.py} +0 -0
  73. /test/unit/{test_interfaces_v2.py → v2/test_interfaces.py} +0 -0
  74. /test/unit/{test_utils_v2.py → v2/test_utils.py} +0 -0
  75. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/LICENSE.md +0 -0
  76. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/WHEEL +0 -0
  77. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/entry_points.txt +0 -0
  78. {unstructured_ingest-0.2.1.dist-info → unstructured_ingest-0.3.0.dist-info}/top_level.txt +0 -0
--- a/unstructured_ingest/v2/processes/connectors/__init__.py
+++ b/unstructured_ingest/v2/processes/connectors/__init__.py
@@ -2,6 +2,8 @@ from __future__ import annotations
 
 import unstructured_ingest.v2.processes.connectors.databricks  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.fsspec  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.kafka  # noqa: F401
+import unstructured_ingest.v2.processes.connectors.qdrant  # noqa: F401
 import unstructured_ingest.v2.processes.connectors.sql  # noqa: F401
 from unstructured_ingest.v2.processes.connector_registry import (
     add_destination_entry,
@@ -12,16 +14,20 @@ from .airtable import CONNECTOR_TYPE as AIRTABLE_CONNECTOR_TYPE
 from .airtable import airtable_source_entry
 from .astradb import CONNECTOR_TYPE as ASTRA_DB_CONNECTOR_TYPE
 from .astradb import astra_db_destination_entry, astra_db_source_entry
-from .azure_cognitive_search import CONNECTOR_TYPE as AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE
-from .azure_cognitive_search import azure_cognitive_search_destination_entry
+from .azure_ai_search import CONNECTOR_TYPE as AZURE_AI_SEARCH_CONNECTOR_TYPE
+from .azure_ai_search import azure_ai_search_destination_entry
 from .chroma import CONNECTOR_TYPE as CHROMA_CONNECTOR_TYPE
 from .chroma import chroma_destination_entry
+from .confluence import CONNECTOR_TYPE as CONFLUENCE_CONNECTOR_TYPE
+from .confluence import confluence_source_entry
 from .couchbase import CONNECTOR_TYPE as COUCHBASE_CONNECTOR_TYPE
 from .couchbase import couchbase_destination_entry, couchbase_source_entry
 from .delta_table import CONNECTOR_TYPE as DELTA_TABLE_CONNECTOR_TYPE
 from .delta_table import delta_table_destination_entry
 from .elasticsearch import CONNECTOR_TYPE as ELASTICSEARCH_CONNECTOR_TYPE
 from .elasticsearch import elasticsearch_destination_entry, elasticsearch_source_entry
+from .gitlab import CONNECTOR_TYPE as GITLAB_CONNECTOR_TYPE
+from .gitlab import gitlab_source_entry
 from .google_drive import CONNECTOR_TYPE as GOOGLE_DRIVE_CONNECTOR_TYPE
 from .google_drive import google_drive_source_entry
 from .kdbai import CONNECTOR_TYPE as KDBAI_CONNECTOR_TYPE
@@ -33,7 +39,7 @@ from .milvus import milvus_destination_entry
 from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
 from .mongodb import mongodb_destination_entry, mongodb_source_entry
 from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
-from .onedrive import onedrive_source_entry
+from .onedrive import onedrive_destination_entry, onedrive_source_entry
 from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
 from .opensearch import opensearch_destination_entry, opensearch_source_entry
 from .outlook import CONNECTOR_TYPE as OUTLOOK_CONNECTOR_TYPE
@@ -72,6 +78,7 @@ add_source_entry(source_type=LOCAL_CONNECTOR_TYPE, entry=local_source_entry)
 add_destination_entry(destination_type=LOCAL_CONNECTOR_TYPE, entry=local_destination_entry)
 
 add_source_entry(source_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_source_entry)
+add_destination_entry(destination_type=ONEDRIVE_CONNECTOR_TYPE, entry=onedrive_destination_entry)
 
 add_source_entry(source_type=OPENSEARCH_CONNECTOR_TYPE, entry=opensearch_source_entry)
 add_destination_entry(
@@ -90,8 +97,8 @@ add_source_entry(source_type=SHAREPOINT_CONNECTOR_TYPE, entry=sharepoint_source_
 
 add_destination_entry(destination_type=MILVUS_CONNECTOR_TYPE, entry=milvus_destination_entry)
 add_destination_entry(
-    destination_type=AZURE_COGNTIVE_SEARCH_CONNECTOR_TYPE,
-    entry=azure_cognitive_search_destination_entry,
+    destination_type=AZURE_AI_SEARCH_CONNECTOR_TYPE,
+    entry=azure_ai_search_destination_entry,
 )
 
 add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)
@@ -99,4 +106,8 @@ add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entr
 
 add_source_entry(source_type=OUTLOOK_CONNECTOR_TYPE, entry=outlook_source_entry)
 
+add_source_entry(source_type=GITLAB_CONNECTOR_TYPE, entry=gitlab_source_entry)
+
 add_source_entry(source_type=SLACK_CONNECTOR_TYPE, entry=slack_source_entry)
+
+add_source_entry(source_type=CONFLUENCE_CONNECTOR_TYPE, entry=confluence_source_entry)
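
For orientation, a minimal sketch (not from the package) of what these registry changes mean for user code in 0.3.0; it relies only on module and symbol names that appear in the diff and in the file list above:

import unstructured_ingest.v2.processes.connectors  # noqa: F401
# Importing the connectors package executes the add_source_entry/add_destination_entry
# calls shown above, so the kafka and qdrant subpackages plus the Confluence and GitLab
# sources are wired in as a side effect of the import.

# The Azure destination now lives under its new module name:
from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
    azure_ai_search_destination_entry,
)
# The pre-0.3.0 module path
# unstructured_ingest.v2.processes.connectors.azure_cognitive_search was renamed
# and is no longer importable.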
--- a/unstructured_ingest/v2/processes/connectors/airtable.py
+++ b/unstructured_ingest/v2/processes/connectors/airtable.py
@@ -12,11 +12,11 @@ from unstructured_ingest.v2.interfaces import (
     ConnectionConfig,
     Downloader,
     DownloaderConfig,
+    DownloadResponse,
     FileData,
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
-    download_responses,
 )
 from unstructured_ingest.v2.processes.connector_registry import (
     SourceRegistryEntry,
@@ -214,7 +214,7 @@ class AirtableDownloader(Downloader):
         row_dict.update(table_row["fields"])
         return row_dict
 
-    def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
+    def run(self, file_data: FileData, **kwargs: Any) -> DownloadResponse:
         table_meta = AirtableTableMeta.model_validate(file_data.additional_metadata)
         table_contents = self.get_table_contents(table_meta=table_meta)
         df = pandas.DataFrame.from_dict(
--- a/unstructured_ingest/v2/processes/connectors/astradb.py
+++ b/unstructured_ingest/v2/processes/connectors/astradb.py
@@ -19,6 +19,7 @@ from unstructured_ingest.error import (
 )
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -129,11 +130,6 @@ class AstraDBIndexerConfig(IndexerConfig):
         "numbers, and underscores."
     )
     keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
-    namespace: Optional[str] = Field(
-        default=None,
-        description="The Astra DB connection namespace.",
-        deprecated="Please use 'keyspace' instead.",
-    )
     batch_size: int = Field(default=20, description="Number of records per batch")
 
 
@@ -147,21 +143,17 @@ class AstraDBUploaderConfig(UploaderConfig):
         "Note that the collection name must only include letters, "
         "numbers, and underscores."
     )
-    embedding_dimension: int = Field(
-        default=384, description="The dimensionality of the embeddings"
-    )
     keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
-    namespace: Optional[str] = Field(
-        default=None,
-        description="The Astra DB connection namespace.",
-        deprecated="Please use 'keyspace' instead.",
-    )
     requested_indexing_policy: Optional[dict[str, Any]] = Field(
         default=None,
         description="The indexing policy to use for the collection.",
         examples=['{"deny": ["metadata"]}'],
     )
     batch_size: int = Field(default=20, description="Number of records per batch")
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )
 
 
 @dataclass
@@ -173,7 +165,7 @@ class AstraDBIndexer(Indexer):
         return get_astra_collection(
             connection_config=self.connection_config,
             collection_name=self.index_config.collection_name,
-            keyspace=self.index_config.keyspace or self.index_config.namespace,
+            keyspace=self.index_config.keyspace,
         )
 
     def precheck(self) -> None:
@@ -223,7 +215,7 @@ class AstraDBIndexer(Indexer):
             additional_metadata={
                 "ids": list(batch),
                 "collection_name": self.index_config.collection_name,
-                "keyspace": self.index_config.keyspace or self.index_config.namespace,
+                "keyspace": self.index_config.keyspace,
             },
         )
         yield fd
@@ -309,10 +301,11 @@ class AstraDBUploadStager(UploadStager):
         default_factory=lambda: AstraDBUploadStagerConfig()
     )
 
-    def conform_dict(self, element_dict: dict) -> dict:
+    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         return {
             "$vector": element_dict.pop("embeddings", None),
             "content": element_dict.pop("text", None),
+            RECORD_ID_LABEL: file_data.identifier,
             "metadata": element_dict,
         }
 
@@ -328,10 +321,15 @@ class AstraDBUploadStager(UploadStager):
             elements_contents = json.load(elements_file)
         conformed_elements = []
         for element in elements_contents:
-            conformed_elements.append(self.conform_dict(element_dict=element))
-        output_path = Path(output_dir) / Path(f"{output_filename}.json")
+            conformed_elements.append(self.conform_dict(element_dict=element, file_data=file_data))
+        output_filename_path = Path(output_filename)
+        if output_filename_path.suffix == ".json":
+            output_path = Path(output_dir) / output_filename_path
+        else:
+            output_path = Path(output_dir) / output_filename_path.with_suffix(".json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
         with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file)
+            json.dump(conformed_elements, output_file, indent=2)
         return output_path
 
 
@@ -346,7 +344,7 @@ class AstraDBUploader(Uploader):
             get_astra_collection(
                 connection_config=self.connection_config,
                 collection_name=self.upload_config.collection_name,
-                keyspace=self.upload_config.keyspace or self.upload_config.namespace,
+                keyspace=self.upload_config.keyspace,
             )
         except Exception as e:
             logger.error(f"Failed to validate connection {e}", exc_info=True)
@@ -357,7 +355,19 @@ class AstraDBUploader(Uploader):
         return get_astra_collection(
             connection_config=self.connection_config,
             collection_name=self.upload_config.collection_name,
-            keyspace=self.upload_config.keyspace or self.upload_config.namespace,
+            keyspace=self.upload_config.keyspace,
+        )
+
+    def delete_by_record_id(self, collection: "AstraDBCollection", file_data: FileData):
+        logger.debug(
+            f"deleting records from collection {collection.name} "
+            f"with {self.upload_config.record_id_key} "
+            f"set to {file_data.identifier}"
+        )
+        delete_filter = {self.upload_config.record_id_key: {"$eq": file_data.identifier}}
+        delete_resp = collection.delete_many(filter=delete_filter)
+        logger.debug(
+            f"deleted {delete_resp.deleted_count} records from collection {collection.name}"
         )
 
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
@@ -371,6 +381,8 @@ class AstraDBUploader(Uploader):
         astra_db_batch_size = self.upload_config.batch_size
         collection = self.get_collection()
 
+        self.delete_by_record_id(collection=collection, file_data=file_data)
+
         for chunk in batch_generator(elements_dict, astra_db_batch_size):
             collection.insert_many(chunk)

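A minimal sketch (not part of the package) of the record tagging this change introduces for Astra DB: each staged element now carries the originating file's identifier under RECORD_ID_LABEL, and delete_by_record_id() uses that field to clear stale entries before re-inserting. The sample element and identifier below are made up; only the constant and the dictionary shape come from the diff above:

from unstructured_ingest.v2.constants import RECORD_ID_LABEL

def conform_element(element_dict: dict, record_id: str) -> dict:
    # mirrors AstraDBUploadStager.conform_dict from the hunk above
    return {
        "$vector": element_dict.pop("embeddings", None),
        "content": element_dict.pop("text", None),
        RECORD_ID_LABEL: record_id,
        "metadata": element_dict,
    }

element = {"text": "Hello world", "embeddings": [0.1, 0.2], "type": "NarrativeText"}
record = conform_element(element, record_id="example-file-id")  # illustrative identifier
print(record)
# On upload, previously inserted rows for the same file are removed with a filter of the
# form {record_id_key: {"$eq": "example-file-id"}} before the new batch is written.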
--- a/unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py
+++ b/unstructured_ingest/v2/processes/connectors/azure_ai_search.py
@@ -9,6 +9,7 @@ from pydantic import Field, Secret
 from unstructured_ingest.error import DestinationConnectionError, WriteError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.constants import RECORD_ID_LABEL
 from unstructured_ingest.v2.interfaces import (
     AccessConfig,
     ConnectionConfig,
@@ -26,18 +27,18 @@ from unstructured_ingest.v2.processes.connectors.utils import parse_datetime
 
 if TYPE_CHECKING:
     from azure.search.documents import SearchClient
+    from azure.search.documents.indexes import SearchIndexClient
 
-CONNECTOR_TYPE = "azure_cognitive_search"
+CONNECTOR_TYPE = "azure_ai_search"
 
 
-class AzureCognitiveSearchAccessConfig(AccessConfig):
-    azure_cognitive_search_key: str = Field(
+class AzureAISearchAccessConfig(AccessConfig):
+    azure_ai_search_key: str = Field(
         alias="key", description="Credential that is used for authenticating to an Azure service"
     )
 
 
-class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
+class AzureAISearchConnectionConfig(ConnectionConfig):
     endpoint: str = Field(
         description="The URL endpoint of an Azure AI (Cognitive) search service. "
         "In the form of https://{{service_name}}.search.windows.net"
@@ -45,10 +46,10 @@ class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
     index: str = Field(
         description="The name of the Azure AI (Cognitive) Search index to connect to."
     )
-    access_config: Secret[AzureCognitiveSearchAccessConfig]
+    access_config: Secret[AzureAISearchAccessConfig]
 
-    @requires_dependencies(["azure.search", "azure.core"], extras="azure-cognitive-search")
-    def generate_client(self) -> "SearchClient":
+    @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
+    def get_search_client(self) -> "SearchClient":
         from azure.core.credentials import AzureKeyCredential
         from azure.search.documents import SearchClient
 
@@ -56,27 +57,43 @@ class AzureCognitiveSearchConnectionConfig(ConnectionConfig):
             endpoint=self.endpoint,
             index_name=self.index,
             credential=AzureKeyCredential(
-                self.access_config.get_secret_value().azure_cognitive_search_key
+                self.access_config.get_secret_value().azure_ai_search_key
+            ),
+        )
+
+    @requires_dependencies(["azure.search", "azure.core"], extras="azure-ai-search")
+    def get_search_index_client(self) -> "SearchIndexClient":
+        from azure.core.credentials import AzureKeyCredential
+        from azure.search.documents.indexes import SearchIndexClient
+
+        return SearchIndexClient(
+            endpoint=self.endpoint,
+            credential=AzureKeyCredential(
+                self.access_config.get_secret_value().azure_ai_search_key
             ),
         )
 
 
-class AzureCognitiveSearchUploadStagerConfig(UploadStagerConfig):
+class AzureAISearchUploadStagerConfig(UploadStagerConfig):
     pass
 
 
-class AzureCognitiveSearchUploaderConfig(UploaderConfig):
+class AzureAISearchUploaderConfig(UploaderConfig):
     batch_size: int = Field(default=100, description="Number of records per batch")
+    record_id_key: str = Field(
+        default=RECORD_ID_LABEL,
+        description="searchable key to find entries for the same record on previous runs",
+    )
 
 
 @dataclass
-class AzureCognitiveSearchUploadStager(UploadStager):
-    upload_stager_config: AzureCognitiveSearchUploadStagerConfig = field(
-        default_factory=lambda: AzureCognitiveSearchUploadStagerConfig()
+class AzureAISearchUploadStager(UploadStager):
+    upload_stager_config: AzureAISearchUploadStagerConfig = field(
+        default_factory=lambda: AzureAISearchUploadStagerConfig()
     )
 
     @staticmethod
-    def conform_dict(data: dict) -> dict:
+    def conform_dict(data: dict, file_data: FileData) -> dict:
        """
        updates the dictionary that is from each Element being converted into a dict/json
        into a dictionary that conforms to the schema expected by the
@@ -84,6 +101,7 @@ class AzureCognitiveSearchUploadStager(UploadStager):
        """
 
        data["id"] = str(uuid.uuid4())
+        data[RECORD_ID_LABEL] = file_data.identifier
 
        if points := data.get("metadata", {}).get("coordinates", {}).get("points"):
            data["metadata"]["coordinates"]["points"] = json.dumps(points)
@@ -124,6 +142,7 @@ class AzureCognitiveSearchUploadStager(UploadStager):
 
     def run(
         self,
+        file_data: FileData,
         elements_filepath: Path,
         output_dir: Path,
         output_filename: str,
@@ -132,23 +151,59 @@ class AzureCognitiveSearchUploadStager(UploadStager):
         with open(elements_filepath) as elements_file:
             elements_contents = json.load(elements_file)
 
-        conformed_elements = [self.conform_dict(data=element) for element in elements_contents]
+        conformed_elements = [
+            self.conform_dict(data=element, file_data=file_data) for element in elements_contents
+        ]
 
         output_path = Path(output_dir) / Path(f"{output_filename}.json")
+        output_path.parent.mkdir(parents=True, exist_ok=True)
         with open(output_path, "w") as output_file:
-            json.dump(conformed_elements, output_file)
+            json.dump(conformed_elements, output_file, indent=2)
         return output_path
 
 
 @dataclass
-class AzureCognitiveSearchUploader(Uploader):
-    upload_config: AzureCognitiveSearchUploaderConfig
-    connection_config: AzureCognitiveSearchConnectionConfig
+class AzureAISearchUploader(Uploader):
+    upload_config: AzureAISearchUploaderConfig
+    connection_config: AzureAISearchConnectionConfig
     connector_type: str = CONNECTOR_TYPE
 
+    def query_docs(self, record_id: str, index_key: str) -> list[str]:
+        client = self.connection_config.get_search_client()
+        results = list(client.search(filter=f"record_id eq '{record_id}'", select=[index_key]))
+        return [result[index_key] for result in results]
+
+    def delete_by_record_id(self, file_data: FileData, index_key: str) -> None:
+        logger.debug(
+            f"deleting any content with metadata "
+            f"{self.upload_config.record_id_key}={file_data.identifier} "
+            f"from azure cognitive search index: {self.connection_config.index}"
+        )
+        doc_ids_to_delete = self.query_docs(record_id=file_data.identifier, index_key=index_key)
+        if not doc_ids_to_delete:
+            return
+        client: SearchClient = self.connection_config.get_search_client()
+        results = client.delete_documents(
+            documents=[{index_key: doc_id} for doc_id in doc_ids_to_delete]
+        )
+        errors = []
+        success = []
+        for result in results:
+            if result.succeeded:
+                success.append(result)
+            else:
+                errors.append(result)
+        logger.debug(f"results: {len(success)} successes, {len(errors)} failures")
+        if errors:
+            raise WriteError(
+                ", ".join(
+                    [f"[{error.status_code}] {error.error_message}" for error in errors],
+                ),
+            )
+
     @DestinationConnectionError.wrap
-    @requires_dependencies(["azure"], extras="azure-cognitive-search")
-    def write_dict(self, *args, elements_dict: list[dict[str, Any]], **kwargs) -> None:
+    @requires_dependencies(["azure"], extras="azure-ai-search")
+    def write_dict(self, elements_dict: list[dict[str, Any]]) -> None:
         import azure.core.exceptions
 
         logger.info(
@@ -156,7 +211,7 @@ class AzureCognitiveSearchUploader(Uploader):
             f"index at {self.connection_config.index}",
         )
         try:
-            results = self.connection_config.generate_client().upload_documents(
+            results = self.connection_config.get_search_client().upload_documents(
                 documents=elements_dict
             )
 
@@ -174,24 +229,42 @@ class AzureCognitiveSearchUploader(Uploader):
             raise WriteError(
                 ", ".join(
                     [
-                        f"{error.azure_cognitive_search_key}: "
+                        f"{error.azure_ai_search_key}: "
                         f"[{error.status_code}] {error.error_message}"
                         for error in errors
                     ],
                 ),
             )
 
+    def can_delete(self) -> bool:
+        search_index_client = self.connection_config.get_search_index_client()
+        index = search_index_client.get_index(name=self.connection_config.index)
+        index_fields = index.fields
+        record_id_fields = [
+            field for field in index_fields if field.name == self.upload_config.record_id_key
+        ]
+        if not record_id_fields:
+            return False
+        record_id_field = record_id_fields[0]
+        return record_id_field.filterable
+
+    def get_index_key(self) -> str:
+        search_index_client = self.connection_config.get_search_index_client()
+        index = search_index_client.get_index(name=self.connection_config.index)
+        index_fields = index.fields
+        key_fields = [field for field in index_fields if field.key]
+        if not key_fields:
+            raise ValueError("no key field found in index fields")
+        return key_fields[0].name
+
     def precheck(self) -> None:
         try:
-            client = self.connection_config.generate_client()
+            client = self.connection_config.get_search_client()
             client.get_document_count()
         except Exception as e:
             logger.error(f"failed to validate connection: {e}", exc_info=True)
             raise DestinationConnectionError(f"failed to validate connection: {e}")
 
-    def write_dict_wrapper(self, elements_dict):
-        return self.write_dict(elements_dict=elements_dict)
-
     def run(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         with path.open("r") as file:
             elements_dict = json.load(file)
@@ -201,17 +274,21 @@ class AzureCognitiveSearchUploader(Uploader):
             f" index at {str(self.connection_config.index)}"
             f" with batch size {str(self.upload_config.batch_size)}"
         )
+        if self.can_delete():
+            index_key = self.get_index_key()
+            self.delete_by_record_id(file_data=file_data, index_key=index_key)
+        else:
+            logger.warning("criteria for deleting previous content not met, skipping")
 
         batch_size = self.upload_config.batch_size
-
         for chunk in batch_generator(elements_dict, batch_size):
             self.write_dict(elements_dict=chunk)  # noqa: E203
 
 
-azure_cognitive_search_destination_entry = DestinationRegistryEntry(
-    connection_config=AzureCognitiveSearchConnectionConfig,
-    uploader=AzureCognitiveSearchUploader,
-    uploader_config=AzureCognitiveSearchUploaderConfig,
-    upload_stager=AzureCognitiveSearchUploadStager,
-    upload_stager_config=AzureCognitiveSearchUploadStagerConfig,
+azure_ai_search_destination_entry = DestinationRegistryEntry(
+    connection_config=AzureAISearchConnectionConfig,
+    uploader=AzureAISearchUploader,
+    uploader_config=AzureAISearchUploaderConfig,
+    upload_stager=AzureAISearchUploadStager,
+    upload_stager_config=AzureAISearchUploadStagerConfig,
 )
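
A hedged sketch of how the renamed Azure AI Search uploader decides whether it can replace previously uploaded documents, mirroring can_delete() above. The endpoint, index name, and key are placeholders; the class and field names come from the diff, and the assumption is that the pydantic configs accept the keyword arguments shown:

from unstructured_ingest.v2.constants import RECORD_ID_LABEL
from unstructured_ingest.v2.processes.connectors.azure_ai_search import (
    AzureAISearchAccessConfig,
    AzureAISearchConnectionConfig,
)

connection_config = AzureAISearchConnectionConfig(
    endpoint="https://my-service.search.windows.net",  # placeholder service URL
    index="my-index",  # placeholder index name
    access_config=AzureAISearchAccessConfig(key="<admin-key>"),  # placeholder credential
)

# can_delete() requires a filterable field named after record_id_key (default RECORD_ID_LABEL)
index = connection_config.get_search_index_client().get_index(name=connection_config.index)
record_id_fields = [f for f in index.fields if f.name == RECORD_ID_LABEL]
if record_id_fields and record_id_fields[0].filterable:
    print("re-ingesting a file will delete its previously uploaded documents first")
else:
    print("no filterable record_id field; previous documents are left in place")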
--- /dev/null
+++ b/unstructured_ingest/v2/processes/connectors/confluence.py
@@ -0,0 +1,195 @@
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING, Generator, List, Optional
+
+from pydantic import Field, Secret
+
+from unstructured_ingest.error import SourceConnectionError
+from unstructured_ingest.utils.dep_check import requires_dependencies
+from unstructured_ingest.v2.interfaces import (
+    AccessConfig,
+    ConnectionConfig,
+    Downloader,
+    DownloaderConfig,
+    DownloadResponse,
+    FileData,
+    FileDataSourceMetadata,
+    Indexer,
+    IndexerConfig,
+    SourceIdentifiers,
+)
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.processes.connector_registry import (
+    SourceRegistryEntry,
+)
+
+if TYPE_CHECKING:
+    from atlassian import Confluence
+
+CONNECTOR_TYPE = "confluence"
+
+
+class ConfluenceAccessConfig(AccessConfig):
+    api_token: str = Field(description="Confluence API token")
+
+
+class ConfluenceConnectionConfig(ConnectionConfig):
+    url: str = Field(description="URL of the Confluence instance")
+    user_email: str = Field(description="User email for authentication")
+    access_config: Secret[ConfluenceAccessConfig] = Field(
+        description="Access configuration for Confluence"
+    )
+
+    @requires_dependencies(["atlassian"], extras="confluence")
+    def get_client(self) -> "Confluence":
+        from atlassian import Confluence
+
+        access_configs = self.access_config.get_secret_value()
+        return Confluence(
+            url=self.url,
+            username=self.user_email,
+            password=access_configs.api_token,
+        )
+
+
+class ConfluenceIndexerConfig(IndexerConfig):
+    max_num_of_spaces: int = Field(500, description="Maximum number of spaces to index")
+    max_num_of_docs_from_each_space: int = Field(
+        100, description="Maximum number of documents to fetch from each space"
+    )
+    spaces: Optional[List[str]] = Field(None, description="List of specific space keys to index")
+
+
+@dataclass
+class ConfluenceIndexer(Indexer):
+    connection_config: ConfluenceConnectionConfig
+    index_config: ConfluenceIndexerConfig
+    connector_type: str = CONNECTOR_TYPE
+
+    def precheck(self) -> bool:
+        try:
+
+            # Attempt to retrieve a list of spaces with limit=1.
+            # This should only succeed if all creds are valid
+            client = self.connection_config.get_client()
+            client.get_all_spaces(limit=1)
+            logger.info("Connection to Confluence successful.")
+            return True
+        except Exception as e:
+            logger.error(f"Failed to connect to Confluence: {e}", exc_info=True)
+            raise SourceConnectionError(f"Failed to connect to Confluence: {e}")
+
+    def _get_space_ids(self) -> List[str]:
+        spaces = self.index_config.spaces
+        if spaces:
+            return spaces
+        else:
+            client = self.connection_config.get_client()
+            all_spaces = client.get_all_spaces(limit=self.index_config.max_num_of_spaces)
+            space_ids = [space["key"] for space in all_spaces["results"]]
+            return space_ids
+
+    def _get_docs_ids_within_one_space(self, space_id: str) -> List[dict]:
+        client = self.connection_config.get_client()
+        pages = client.get_all_pages_from_space(
+            space=space_id,
+            start=0,
+            limit=self.index_config.max_num_of_docs_from_each_space,
+            expand=None,
+            content_type="page",
+            status=None,
+        )
+        doc_ids = [{"space_id": space_id, "doc_id": page["id"]} for page in pages]
+        return doc_ids
+
+    def run(self) -> Generator[FileData, None, None]:
+        from time import time
+
+        space_ids = self._get_space_ids()
+        for space_id in space_ids:
+            doc_ids = self._get_docs_ids_within_one_space(space_id)
+            for doc in doc_ids:
+                doc_id = doc["doc_id"]
+                # Build metadata
+                metadata = FileDataSourceMetadata(
+                    date_processed=str(time()),
+                    url=f"{self.connection_config.url}/pages/{doc_id}",
+                    record_locator={
+                        "space_id": space_id,
+                        "document_id": doc_id,
+                    },
+                )
+                additional_metadata = {
+                    "space_id": space_id,
+                    "document_id": doc_id,
+                }
+
+                # Construct relative path and filename
+                filename = f"{doc_id}.html"
+                relative_path = str(Path(space_id) / filename)
+
+                source_identifiers = SourceIdentifiers(
+                    filename=filename,
+                    fullpath=relative_path,
+                    rel_path=relative_path,
+                )
+
+                file_data = FileData(
+                    identifier=doc_id,
+                    connector_type=self.connector_type,
+                    metadata=metadata,
+                    additional_metadata=additional_metadata,
+                    source_identifiers=source_identifiers,
+                )
+                yield file_data
+
+
+class ConfluenceDownloaderConfig(DownloaderConfig):
+    pass
+
+
+@dataclass
+class ConfluenceDownloader(Downloader):
+    connection_config: ConfluenceConnectionConfig
+    download_config: ConfluenceDownloaderConfig = field(default_factory=ConfluenceDownloaderConfig)
+    connector_type: str = CONNECTOR_TYPE
+
+    def run(self, file_data: FileData, **kwargs) -> DownloadResponse:
+        doc_id = file_data.identifier
+        try:
+            client = self.connection_config.get_client()
+            page = client.get_page_by_id(
+                page_id=doc_id,
+                expand="history.lastUpdated,version,body.view",
+            )
+        except Exception as e:
+            logger.error(f"Failed to retrieve page with ID {doc_id}: {e}", exc_info=True)
+            raise SourceConnectionError(f"Failed to retrieve page with ID {doc_id}: {e}")
+
+        if not page:
+            raise ValueError(f"Page with ID {doc_id} does not exist.")
+
+        content = page["body"]["view"]["value"]
+
+        filepath = file_data.source_identifiers.relative_path
+        download_path = Path(self.download_dir) / filepath
+        download_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(download_path, "w", encoding="utf8") as f:
+            f.write(content)
+
+        # Update file_data with metadata
+        file_data.metadata.date_created = page["history"]["createdDate"]
+        file_data.metadata.date_modified = page["version"]["when"]
+        file_data.metadata.version = str(page["version"]["number"])
+        file_data.display_name = page["title"]
+
+        return self.generate_download_response(file_data=file_data, download_path=download_path)
+
+
+confluence_source_entry = SourceRegistryEntry(
+    connection_config=ConfluenceConnectionConfig,
+    indexer_config=ConfluenceIndexerConfig,
+    indexer=ConfluenceIndexer,
+    downloader_config=ConfluenceDownloaderConfig,
+    downloader=ConfluenceDownloader,
+)
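
A hedged sketch of driving the new Confluence source outside a full pipeline. The URL, email, token, and space key are placeholders; only class and field names defined in the new module are used, and the assumption is that ConfluenceIndexer can be constructed directly with the fields shown in its dataclass definition:

from unstructured_ingest.v2.processes.connectors.confluence import (
    ConfluenceAccessConfig,
    ConfluenceConnectionConfig,
    ConfluenceIndexer,
    ConfluenceIndexerConfig,
)

connection_config = ConfluenceConnectionConfig(
    url="https://example.atlassian.net/wiki",  # placeholder Confluence instance
    user_email="user@example.com",  # placeholder account
    access_config=ConfluenceAccessConfig(api_token="<api-token>"),  # placeholder token
)
indexer = ConfluenceIndexer(
    connection_config=connection_config,
    index_config=ConfluenceIndexerConfig(
        spaces=["ENG"],  # placeholder space key
        max_num_of_docs_from_each_space=10,
    ),
)

indexer.precheck()  # raises SourceConnectionError if the credentials are rejected
for file_data in indexer.run():
    # each FileData describes one page; the downloader later stores it as <space>/<page_id>.html
    print(file_data.identifier, file_data.source_identifiers.fullpath)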