unstructured-ingest 0.5.10__py3-none-any.whl → 0.5.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of unstructured-ingest might be problematic. Click here for more details.

@@ -31,6 +31,7 @@ from unstructured_ingest.v2.processes.connectors.astradb import (
31
31
  AstraDBUploader,
32
32
  AstraDBUploaderConfig,
33
33
  AstraDBUploadStager,
34
+ AstraDBUploadStagerConfig,
34
35
  DestinationConnectionError,
35
36
  SourceConnectionError,
36
37
  )
@@ -258,3 +259,23 @@ def test_astra_stager(
258
259
  stager=stager,
259
260
  tmp_dir=tmp_path,
260
261
  )
262
+
263
+
264
+ @pytest.mark.tags(CONNECTOR_TYPE, DESTINATION_TAG, VECTOR_DB_TAG)
265
+ @pytest.mark.parametrize("upload_file_str", ["upload_file_ndjson", "upload_file"])
266
+ def test_astra_stager_flatten_metadata(
267
+ request: TopRequest,
268
+ upload_file_str: str,
269
+ tmp_path: Path,
270
+ ):
271
+ stager_config = AstraDBUploadStagerConfig(flatten_metadata=True)
272
+ upload_file: Path = request.getfixturevalue(upload_file_str)
273
+ stager = AstraDBUploadStager(upload_stager_config=stager_config)
274
+ stager_validation(
275
+ configs=StagerValidationConfigs(
276
+ test_id=CONNECTOR_TYPE, expected_count=22, expected_folder="stager_flatten_metadata"
277
+ ),
278
+ input_file=upload_file,
279
+ stager=stager,
280
+ tmp_dir=tmp_path,
281
+ )
@@ -1,3 +1,19 @@
1
+ # add this back in when figure out why it's failing since NOTHING changed when it started failing
2
+
3
+ # ==================================== ERRORS ====================================
4
+ # _________ ERROR collecting test/integration/connectors/test_chroma.py __________
5
+ # ImportError while importing test module '/home/runner/work/unstructured-ingest/
6
+ # unstructured-ingest/test/integration/connectors/test_chroma.py'.
7
+ # Hint: make sure your test modules/packages have valid Python names.
8
+ # Traceback:
9
+ # /opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/importlib/__init__.py:126: in import_module
10
+ # return _bootstrap._gcd_import(name[level:], package, level)
11
+ # test/integration/connectors/test_chroma.py:4: in <module>
12
+ # import chromadb
13
+ # E ModuleNotFoundError: No module named 'chromadb'
14
+
15
+
16
+ """
1
17
  import json
2
18
  from pathlib import Path
3
19
 
@@ -116,3 +132,5 @@ def test_chroma_stager(
116
132
  stager=stager,
117
133
  tmp_dir=tmp_path,
118
134
  )
135
+
136
+ """
@@ -0,0 +1,142 @@
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Optional
4
+
5
+ import pytest
6
+
7
+ from test.integration.connectors.utils.constants import SOURCE_TAG, UNCATEGORIZED_TAG
8
+ from test.integration.connectors.utils.validation.source import (
9
+ SourceValidationConfigs,
10
+ source_connector_validation,
11
+ )
12
+ from test.integration.utils import requires_env
13
+ from unstructured_ingest.v2.errors import UserAuthError
14
+ from unstructured_ingest.v2.processes.connectors.zendesk import (
15
+ CONNECTOR_TYPE,
16
+ ZendeskAccessConfig,
17
+ ZendeskConnectionConfig,
18
+ ZendeskDownloader,
19
+ ZendeskDownloaderConfig,
20
+ ZendeskIndexer,
21
+ ZendeskIndexerConfig,
22
+ )
23
+
24
+
25
+ async def zendesk_source_test(
26
+ tmp_path: Path,
27
+ token: Optional[str] = None,
28
+ email: Optional[str] = None,
29
+ subdomain: Optional[str] = None,
30
+ ):
31
+
32
+ access_config = ZendeskAccessConfig(api_token=token)
33
+ connection_config = ZendeskConnectionConfig(
34
+ subdomain=subdomain, email=email, access_config=access_config
35
+ )
36
+
37
+ index_config = ZendeskIndexerConfig(batch_size=2, item_type="tickets")
38
+
39
+ indexer = ZendeskIndexer(
40
+ connection_config=connection_config,
41
+ index_config=index_config,
42
+ connector_type=CONNECTOR_TYPE,
43
+ )
44
+
45
+ # handle downloader.
46
+ download_config = ZendeskDownloaderConfig(download_dir=tmp_path)
47
+
48
+ downloader = ZendeskDownloader(
49
+ connection_config=connection_config,
50
+ download_config=download_config,
51
+ connector_type=CONNECTOR_TYPE,
52
+ )
53
+
54
+ # Run the source connector validation
55
+ await source_connector_validation(
56
+ indexer=indexer,
57
+ downloader=downloader,
58
+ configs=SourceValidationConfigs(
59
+ test_id="zendesk-tickets",
60
+ expected_num_files=4,
61
+ validate_file_data=False,
62
+ validate_downloaded_files=True,
63
+ ),
64
+ )
65
+
66
+
67
+ async def zendesk_source_articles_test(
68
+ tmp_path: Path,
69
+ token: Optional[str] = None,
70
+ email: Optional[str] = None,
71
+ subdomain: Optional[str] = None,
72
+ ):
73
+
74
+ access_config = ZendeskAccessConfig(api_token=token)
75
+ connection_config = ZendeskConnectionConfig(
76
+ subdomain=subdomain, email=email, access_config=access_config
77
+ )
78
+
79
+ index_config = ZendeskIndexerConfig(batch_size=2, item_type="articles")
80
+
81
+ indexer = ZendeskIndexer(
82
+ connection_config=connection_config,
83
+ index_config=index_config,
84
+ connector_type=CONNECTOR_TYPE,
85
+ )
86
+
87
+ # handle downloader.
88
+ download_config = ZendeskDownloaderConfig(download_dir=tmp_path, extract_images=True)
89
+
90
+ downloader = ZendeskDownloader(
91
+ connection_config=connection_config,
92
+ download_config=download_config,
93
+ connector_type=CONNECTOR_TYPE,
94
+ )
95
+
96
+ # Run the source connector validation
97
+ await source_connector_validation(
98
+ indexer=indexer,
99
+ downloader=downloader,
100
+ configs=SourceValidationConfigs(
101
+ test_id="zendesk-articles",
102
+ expected_num_files=4,
103
+ validate_file_data=False,
104
+ validate_downloaded_files=True,
105
+ ),
106
+ )
107
+
108
+
109
+ @pytest.mark.asyncio
110
+ @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
111
+ @requires_env("ZENDESK_TOKEN")
112
+ async def test_zendesk_source(temp_dir):
113
+ await zendesk_source_test(
114
+ tmp_path=temp_dir,
115
+ token=os.environ["ZENDESK_TOKEN"],
116
+ email="test@unstructured.io",
117
+ subdomain="unstructuredhelp",
118
+ )
119
+
120
+
121
+ @pytest.mark.asyncio
122
+ @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
123
+ @requires_env("ZENDESK_TOKEN")
124
+ async def test_zendesk_source_articles(temp_dir):
125
+ await zendesk_source_articles_test(
126
+ tmp_path=temp_dir,
127
+ token=os.environ["ZENDESK_TOKEN"],
128
+ email="test@unstructured.io",
129
+ subdomain="unstructuredhelp",
130
+ )
131
+
132
+
133
+ @pytest.mark.asyncio
134
+ @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
135
+ async def test_zendesk_source_articles_fail(temp_dir):
136
+ with pytest.raises(expected_exception=UserAuthError):
137
+ await zendesk_source_articles_test(
138
+ tmp_path=temp_dir,
139
+ token="FORCE_FAIL_TOKEN",
140
+ email="test@unstructured.io",
141
+ subdomain="unstructuredhelp",
142
+ )
@@ -9,9 +9,10 @@ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers, Uploa
9
9
 
10
10
  class StagerValidationConfigs(ValidationConfig):
11
11
  expected_count: int
12
+ expected_folder: str = "stager"
12
13
 
13
14
  def stager_output_dir(self) -> Path:
14
- dir = self.test_output_dir() / "stager"
15
+ dir = self.test_output_dir() / self.expected_folder
15
16
  dir.mkdir(exist_ok=True, parents=True)
16
17
  return dir
17
18
 
@@ -1 +1 @@
1
- __version__ = "0.5.10" # pragma: no cover
1
+ __version__ = "0.5.12" # pragma: no cover
@@ -1,5 +1,5 @@
1
1
  import os
2
- from abc import ABC, abstractmethod
2
+ from abc import ABC
3
3
  from pathlib import Path
4
4
  from typing import Any, Optional, TypedDict, TypeVar, Union
5
5
 
@@ -81,9 +81,8 @@ class Downloader(BaseProcess, BaseConnector, ABC):
81
81
  def is_async(self) -> bool:
82
82
  return True
83
83
 
84
- @abstractmethod
85
84
  def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
86
- pass
85
+ raise NotImplementedError()
87
86
 
88
87
  async def run_async(self, file_data: FileData, **kwargs: Any) -> download_responses:
89
88
  return self.run(file_data=file_data, **kwargs)
@@ -144,10 +144,6 @@ async def get_async_astra_collection(
144
144
  return async_astra_db_collection
145
145
 
146
146
 
147
- class AstraDBUploadStagerConfig(UploadStagerConfig):
148
- pass
149
-
150
-
151
147
  class AstraDBIndexerConfig(IndexerConfig):
152
148
  collection_name: str = Field(
153
149
  description="The name of the Astra DB collection. "
@@ -158,30 +154,6 @@ class AstraDBIndexerConfig(IndexerConfig):
158
154
  batch_size: int = Field(default=20, description="Number of records per batch")
159
155
 
160
156
 
161
- class AstraDBDownloaderConfig(DownloaderConfig):
162
- fields: list[str] = field(default_factory=list)
163
-
164
-
165
- class AstraDBUploaderConfig(UploaderConfig):
166
- collection_name: Optional[str] = Field(
167
- description="The name of the Astra DB collection. "
168
- "Note that the collection name must only include letters, "
169
- "numbers, and underscores.",
170
- default=None,
171
- )
172
- keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
173
- requested_indexing_policy: Optional[dict[str, Any]] = Field(
174
- default=None,
175
- description="The indexing policy to use for the collection.",
176
- examples=['{"deny": ["metadata"]}'],
177
- )
178
- batch_size: int = Field(default=20, description="Number of records per batch")
179
- record_id_key: str = Field(
180
- default=RECORD_ID_LABEL,
181
- description="searchable key to find entries for the same record on previous runs",
182
- )
183
-
184
-
185
157
  @dataclass
186
158
  class AstraDBIndexer(Indexer):
187
159
  connection_config: AstraDBConnectionConfig
@@ -239,6 +211,10 @@ class AstraDBIndexer(Indexer):
239
211
  yield fd
240
212
 
241
213
 
214
+ class AstraDBDownloaderConfig(DownloaderConfig):
215
+ fields: list[str] = field(default_factory=list)
216
+
217
+
242
218
  @dataclass
243
219
  class AstraDBDownloader(Downloader):
244
220
  connection_config: AstraDBConnectionConfig
@@ -315,6 +291,12 @@ class AstraDBDownloader(Downloader):
315
291
  return download_responses
316
292
 
317
293
 
294
+ class AstraDBUploadStagerConfig(UploadStagerConfig):
295
+ flatten_metadata: Optional[bool] = Field(
296
+ default=False, description="Move metadata to top level of the record."
297
+ )
298
+
299
+
318
300
  @dataclass
319
301
  class AstraDBUploadStager(UploadStager):
320
302
  upload_stager_config: AstraDBUploadStagerConfig = field(
@@ -336,6 +318,12 @@ class AstraDBUploadStager(UploadStager):
336
318
 
337
319
  def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
338
320
  self.truncate_dict_elements(element_dict)
321
+ if self.upload_stager_config.flatten_metadata:
322
+ # move metadata to top level so it isn't nested in metadata column
323
+ metadata = element_dict.pop("metadata", None)
324
+ if metadata:
325
+ element_dict.update(metadata)
326
+
339
327
  return {
340
328
  "$vector": element_dict.pop("embeddings", None),
341
329
  "content": element_dict.pop("text", None),
@@ -344,6 +332,26 @@ class AstraDBUploadStager(UploadStager):
344
332
  }
345
333
 
346
334
 
335
+ class AstraDBUploaderConfig(UploaderConfig):
336
+ collection_name: Optional[str] = Field(
337
+ description="The name of the Astra DB collection. "
338
+ "Note that the collection name must only include letters, "
339
+ "numbers, and underscores.",
340
+ default=None,
341
+ )
342
+ keyspace: Optional[str] = Field(default=None, description="The Astra DB connection keyspace.")
343
+ requested_indexing_policy: Optional[dict[str, Any]] = Field(
344
+ default=None,
345
+ description="The indexing policy to use for the collection.",
346
+ examples=['{"deny": ["metadata"]}'],
347
+ )
348
+ batch_size: int = Field(default=20, description="Number of records per batch")
349
+ record_id_key: str = Field(
350
+ default=RECORD_ID_LABEL,
351
+ description="searchable key to find entries for the same record on previous runs",
352
+ )
353
+
354
+
347
355
  @dataclass
348
356
  class AstraDBUploader(Uploader):
349
357
  connection_config: AstraDBConnectionConfig
@@ -111,6 +111,28 @@ class Neo4jUploadStager(UploadStager):
111
111
 
112
112
  return output_filepath
113
113
 
114
+ def _add_entities(self, element: dict, graph: "Graph", element_node: _Node) -> None:
115
+ entities = element.get("metadata", {}).get("entities", [])
116
+ if not entities:
117
+ return None
118
+ if not isinstance(entities, list):
119
+ return None
120
+
121
+ for entity in entities:
122
+ if not isinstance(entity, dict):
123
+ continue
124
+ if "entity" not in entity or "type" not in entity:
125
+ continue
126
+ entity_node = _Node(
127
+ labels=[Label.ENTITY], properties={"id": entity["entity"]}, id_=entity["entity"]
128
+ )
129
+ graph.add_edge(
130
+ entity_node,
131
+ _Node(labels=[Label.ENTITY], properties={"id": entity["type"]}, id_=entity["type"]),
132
+ relationship=Relationship.ENTITY_TYPE,
133
+ )
134
+ graph.add_edge(element_node, entity_node, relationship=Relationship.HAS_ENTITY)
135
+
114
136
  def _create_lexical_graph(self, elements: list[dict], document_node: _Node) -> "Graph":
115
137
  import networkx as nx
116
138
 
@@ -129,25 +151,23 @@ class Neo4jUploadStager(UploadStager):
129
151
  previous_node = element_node
130
152
  graph.add_edge(element_node, document_node, relationship=Relationship.PART_OF_DOCUMENT)
131
153
 
154
+ self._add_entities(element, graph, element_node)
155
+
132
156
  if self._is_chunk(element):
133
- origin_element_nodes = [
134
- self._create_element_node(origin_element)
135
- for origin_element in format_and_truncate_orig_elements(element)
136
- ]
137
- graph.add_edges_from(
138
- [
139
- (origin_element_node, element_node)
140
- for origin_element_node in origin_element_nodes
141
- ],
142
- relationship=Relationship.PART_OF_CHUNK,
143
- )
144
- graph.add_edges_from(
145
- [
146
- (origin_element_node, document_node)
147
- for origin_element_node in origin_element_nodes
148
- ],
149
- relationship=Relationship.PART_OF_DOCUMENT,
150
- )
157
+ for origin_element in format_and_truncate_orig_elements(element):
158
+ origin_element_node = self._create_element_node(origin_element)
159
+
160
+ graph.add_edge(
161
+ origin_element_node,
162
+ element_node,
163
+ relationship=Relationship.PART_OF_CHUNK,
164
+ )
165
+ graph.add_edge(
166
+ origin_element_node,
167
+ document_node,
168
+ relationship=Relationship.PART_OF_DOCUMENT,
169
+ )
170
+ self._add_entities(origin_element, graph, origin_element_node)
151
171
 
152
172
  return graph
153
173
 
@@ -231,6 +251,7 @@ class Label(Enum):
231
251
  UNSTRUCTURED_ELEMENT = "UnstructuredElement"
232
252
  CHUNK = "Chunk"
233
253
  DOCUMENT = "Document"
254
+ ENTITY = "Entity"
234
255
 
235
256
 
236
257
  class Relationship(Enum):
@@ -238,6 +259,8 @@ class Relationship(Enum):
238
259
  PART_OF_CHUNK = "PART_OF_CHUNK"
239
260
  NEXT_CHUNK = "NEXT_CHUNK"
240
261
  NEXT_ELEMENT = "NEXT_ELEMENT"
262
+ ENTITY_TYPE = "ENTITY_TYPE"
263
+ HAS_ENTITY = "HAS_ENTITY"
241
264
 
242
265
 
243
266
  class Neo4jUploaderConfig(UploaderConfig):
@@ -0,0 +1,31 @@
1
+ from unstructured_ingest.v2.processes.connector_registry import (
2
+ add_source_entry,
3
+ )
4
+
5
+ from .zendesk import (
6
+ CONNECTOR_TYPE,
7
+ ZendeskAccessConfig,
8
+ ZendeskClient,
9
+ ZendeskConnectionConfig,
10
+ ZendeskDownloader,
11
+ ZendeskDownloaderConfig,
12
+ ZendeskIndexer,
13
+ ZendeskIndexerConfig,
14
+ ZendeskTicket,
15
+ zendesk_source_entry,
16
+ )
17
+
18
+ __all__ = [
19
+ "add_source_entry",
20
+ "zendesk_source_entry",
21
+ "ZendeskAccessConfig",
22
+ "ZendeskClient",
23
+ "ZendeskConnectionConfig",
24
+ "ZendeskDownloader",
25
+ "ZendeskDownloaderConfig",
26
+ "ZendeskIndexer",
27
+ "ZendeskIndexerConfig",
28
+ "ZendeskTicket",
29
+ ]
30
+
31
+ add_source_entry(source_type=CONNECTOR_TYPE, entry=zendesk_source_entry)