unstructured-ingest 0.5.14__py3-none-any.whl → 0.5.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of unstructured-ingest might be problematic.

Files changed (39)
  1. test/integration/connectors/test_confluence.py +2 -2
  2. test/integration/connectors/test_zendesk.py +31 -53
  3. test/integration/connectors/utils/validation/source.py +5 -3
  4. test/unit/v2/connectors/test_confluence.py +35 -3
  5. unstructured_ingest/__version__.py +1 -1
  6. unstructured_ingest/embed/huggingface.py +3 -7
  7. unstructured_ingest/utils/data_prep.py +4 -2
  8. unstructured_ingest/v2/interfaces/file_data.py +1 -1
  9. unstructured_ingest/v2/interfaces/upload_stager.py +3 -6
  10. unstructured_ingest/v2/pipeline/pipeline.py +7 -0
  11. unstructured_ingest/v2/pipeline/steps/chunk.py +1 -1
  12. unstructured_ingest/v2/pipeline/steps/download.py +3 -3
  13. unstructured_ingest/v2/pipeline/steps/embed.py +1 -1
  14. unstructured_ingest/v2/pipeline/steps/index.py +4 -4
  15. unstructured_ingest/v2/pipeline/steps/partition.py +1 -1
  16. unstructured_ingest/v2/processes/connectors/confluence.py +20 -3
  17. unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py +6 -0
  18. unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py +6 -0
  19. unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py +6 -0
  20. unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py +6 -0
  21. unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +1 -1
  22. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +6 -0
  23. unstructured_ingest/v2/processes/connectors/fsspec/box.py +6 -0
  24. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +6 -0
  25. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +6 -0
  26. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +6 -0
  27. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +6 -0
  28. unstructured_ingest/v2/processes/connectors/local.py +8 -1
  29. unstructured_ingest/v2/processes/connectors/zendesk/client.py +221 -156
  30. unstructured_ingest/v2/processes/connectors/zendesk/zendesk.py +83 -274
  31. unstructured_ingest/v2/processes/embedder.py +3 -4
  32. unstructured_ingest/v2/processes/utils/__init__.py +0 -0
  33. unstructured_ingest/v2/processes/utils/blob_storage.py +31 -0
  34. {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/METADATA +20 -20
  35. {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/RECORD +39 -37
  36. {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/LICENSE.md +0 -0
  37. {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/WHEEL +0 -0
  38. {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/entry_points.txt +0 -0
  39. {unstructured_ingest-0.5.14.dist-info → unstructured_ingest-0.5.16.dist-info}/top_level.txt +0 -0
test/integration/connectors/test_confluence.py

@@ -30,7 +30,7 @@ async def test_confluence_source(temp_dir):
     spaces = ["testteamsp", "MFS"]
 
     # Create connection and indexer configurations
-    access_config = ConfluenceAccessConfig(password=api_token)
+    access_config = ConfluenceAccessConfig(api_token=api_token)
     connection_config = ConfluenceConnectionConfig(
         url=confluence_url,
         username=user_email,

@@ -77,7 +77,7 @@ async def test_confluence_source_large(temp_dir):
     spaces = ["testteamsp1"]
 
     # Create connection and indexer configurations
-    access_config = ConfluenceAccessConfig(password=api_token)
+    access_config = ConfluenceAccessConfig(api_token=api_token)
     connection_config = ConfluenceConnectionConfig(
         url=confluence_url,
         username=user_email,
test/integration/connectors/test_zendesk.py

@@ -1,6 +1,5 @@
 import os
 from pathlib import Path
-from typing import Optional
 
 import pytest
 

@@ -21,20 +20,20 @@ from unstructured_ingest.v2.processes.connectors.zendesk.zendesk import (
     ZendeskIndexerConfig,
 )
 
+SUBDOMAIN = "unstructuredhelp"
+EMAIL = "test@unstructured.io"
 
-async def zendesk_source_test(
-    tmp_path: Path,
-    token: Optional[str] = None,
-    email: Optional[str] = None,
-    subdomain: Optional[str] = None,
-):
 
-    access_config = ZendeskAccessConfig(api_token=token)
+@pytest.mark.asyncio
+@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
+@requires_env("ZENDESK_TOKEN")
+async def test_zendesk_source_tickets(temp_dir: Path):
+    access_config = ZendeskAccessConfig(api_token=os.environ["ZENDESK_TOKEN"])
     connection_config = ZendeskConnectionConfig(
-        subdomain=subdomain, email=email, access_config=access_config
+        subdomain=SUBDOMAIN, email=EMAIL, access_config=access_config
     )
 
-    index_config = ZendeskIndexerConfig(batch_size=2, item_type="tickets")
+    index_config = ZendeskIndexerConfig(item_type="tickets")
 
     indexer = ZendeskIndexer(
         connection_config=connection_config,

@@ -43,7 +42,7 @@ async def zendesk_source_test(
     )
 
     # handle downloader.
-    download_config = ZendeskDownloaderConfig(download_dir=tmp_path)
+    download_config = ZendeskDownloaderConfig(download_dir=temp_dir)
 
     downloader = ZendeskDownloader(
         connection_config=connection_config,

@@ -57,26 +56,23 @@ async def zendesk_source_test(
         downloader=downloader,
         configs=SourceValidationConfigs(
             test_id="zendesk-tickets",
-            expected_num_files=4,
+            expected_num_files=8,
             validate_file_data=False,
             validate_downloaded_files=True,
         ),
     )
 
 
-async def zendesk_source_articles_test(
-    tmp_path: Path,
-    token: Optional[str] = None,
-    email: Optional[str] = None,
-    subdomain: Optional[str] = None,
-):
-
-    access_config = ZendeskAccessConfig(api_token=token)
+@pytest.mark.asyncio
+@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
+@requires_env("ZENDESK_TOKEN")
+async def test_zendesk_source_articles(temp_dir):
+    access_config = ZendeskAccessConfig(api_token=os.environ["ZENDESK_TOKEN"])
     connection_config = ZendeskConnectionConfig(
-        subdomain=subdomain, email=email, access_config=access_config
+        subdomain=SUBDOMAIN, email=EMAIL, access_config=access_config
     )
 
-    index_config = ZendeskIndexerConfig(batch_size=2, item_type="articles")
+    index_config = ZendeskIndexerConfig(item_type="articles")
 
     indexer = ZendeskIndexer(
         connection_config=connection_config,

@@ -85,7 +81,7 @@ async def zendesk_source_articles_test(
     )
 
     # handle downloader.
-    download_config = ZendeskDownloaderConfig(download_dir=tmp_path, extract_images=True)
+    download_config = ZendeskDownloaderConfig(download_dir=temp_dir, extract_images=True)
 
     downloader = ZendeskDownloader(
         connection_config=connection_config,

@@ -99,44 +95,26 @@ async def zendesk_source_articles_test(
         downloader=downloader,
         configs=SourceValidationConfigs(
             test_id="zendesk-articles",
-            expected_num_files=4,
-            validate_file_data=False,
+            expected_num_files=8,
+            validate_file_data=True,
             validate_downloaded_files=True,
         ),
     )
 
 
-@pytest.mark.asyncio
 @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
-@requires_env("ZENDESK_TOKEN")
-async def test_zendesk_source(temp_dir):
-    await zendesk_source_test(
-        tmp_path=temp_dir,
-        token=os.environ["ZENDESK_TOKEN"],
-        email="test@unstructured.io",
-        subdomain="unstructuredhelp",
+def test_zendesk_source_articles_fail(temp_dir):
+    access_config = ZendeskAccessConfig(api_token="FAKE_TOKEN")
+    connection_config = ZendeskConnectionConfig(
+        subdomain=SUBDOMAIN, email=EMAIL, access_config=access_config
     )
 
+    index_config = ZendeskIndexerConfig(item_type="tickets")
 
-@pytest.mark.asyncio
-@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
-@requires_env("ZENDESK_TOKEN")
-async def test_zendesk_source_articles(temp_dir):
-    await zendesk_source_articles_test(
-        tmp_path=temp_dir,
-        token=os.environ["ZENDESK_TOKEN"],
-        email="test@unstructured.io",
-        subdomain="unstructuredhelp",
+    indexer = ZendeskIndexer(
+        connection_config=connection_config,
+        index_config=index_config,
+        connector_type=CONNECTOR_TYPE,
     )
-
-
-@pytest.mark.asyncio
-@pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
-async def test_zendesk_source_articles_fail(temp_dir):
     with pytest.raises(expected_exception=UserAuthError):
-        await zendesk_source_articles_test(
-            tmp_path=temp_dir,
-            token="FORCE_FAIL_TOKEN",
-            email="test@unstructured.io",
-            subdomain="unstructuredhelp",
-        )
+        indexer.precheck()
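
For context, a minimal sketch of the credential precheck pattern the new failure test relies on: ZendeskIndexer.precheck() raises UserAuthError on bad credentials before any download work starts. The subdomain, email, token, and connector-type string below are placeholders, not values from the release.

```python
# Sketch only: validate Zendesk credentials up front via the indexer precheck,
# mirroring the test above. Subdomain, email, token, and connector type are placeholders.
from unstructured_ingest.v2.processes.connectors.zendesk.zendesk import (
    ZendeskAccessConfig,
    ZendeskConnectionConfig,
    ZendeskIndexer,
    ZendeskIndexerConfig,
)

connection_config = ZendeskConnectionConfig(
    subdomain="example",
    email="user@example.com",
    access_config=ZendeskAccessConfig(api_token="..."),
)
indexer = ZendeskIndexer(
    connection_config=connection_config,
    index_config=ZendeskIndexerConfig(item_type="tickets"),
    connector_type="zendesk",  # assumed value of CONNECTOR_TYPE
)
# Raises UserAuthError if the subdomain/email/token combination is rejected.
indexer.precheck()
```
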
test/integration/connectors/utils/validation/source.py

@@ -103,7 +103,7 @@ def check_contents(
     file_data_path = expected_output_dir / f"{file_data.identifier}.json"
     with file_data_path.open("r") as file:
         expected_file_data_contents = json.load(file)
-    current_file_data_contents = file_data.model_dump()
+    current_file_data_contents = json.loads(file_data.model_dump_json())
     expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
     current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
     diff = DeepDiff(expected_file_data_contents, current_file_data_contents)

@@ -184,7 +184,7 @@ def update_fixtures(
     for file_data in all_file_data:
         file_data_path = file_data_output_path / f"{file_data.identifier}.json"
         with file_data_path.open(mode="w") as f:
-            json.dump(file_data.model_dump(), f, indent=2)
+            f.write(file_data.model_dump_json(indent=2))
 
     # Record file structure of download directory
     download_files = get_files(dir_path=download_dir)

@@ -216,7 +216,9 @@ def run_all_validations(
         len(predownload_file_data) == expected_number_indexed_file_data
     ), f"expected {expected_number_indexed_file_data} but got {len(predownload_file_data)}"
     if expected_num_files := configs.expected_num_files:
-        assert len(postdownload_file_data) == expected_num_files
+        assert (
+            len(postdownload_file_data) == expected_num_files
+        ), f"expected {expected_num_files} but got {len(postdownload_file_data)}"
 
     for pre_data, post_data in zip(predownload_file_data, postdownload_file_data):
         configs.run_file_data_validation(
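
For context, the switch from model_dump() to a model_dump_json() round trip matters because fixture files are read back with json.load, so the in-memory dict has to contain only JSON-native types before DeepDiff compares them. A small self-contained illustration (the Example model is hypothetical, not from the package):

```python
# Hypothetical model, illustrating why the comparison round-trips through JSON:
# model_dump() keeps Python objects (e.g. datetime), which never equal the
# strings produced by json.load() on the stored fixture.
import json
from datetime import datetime

from pydantic import BaseModel


class Example(BaseModel):
    created_at: datetime


m = Example(created_at=datetime(2025, 1, 1))
print(type(m.model_dump()["created_at"]))                   # <class 'datetime.datetime'>
print(type(json.loads(m.model_dump_json())["created_at"]))  # <class 'str'>
```
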
test/unit/v2/connectors/test_confluence.py

@@ -11,7 +11,7 @@ def test_connection_config_multiple_auth():
     with pytest.raises(ValidationError):
         ConfluenceConnectionConfig(
             access_config=ConfluenceAccessConfig(
-                password="api_token",
+                password="password",
                 token="access_token",
             ),
             username="user_email",

@@ -19,14 +19,46 @@ def test_connection_config_multiple_auth():
         )
 
 
+def test_connection_config_multiple_auth2():
+    with pytest.raises(ValidationError):
+        ConfluenceConnectionConfig(
+            access_config=ConfluenceAccessConfig(
+                api_token="api_token",
+                token="access_token",
+            ),
+            username="user_email",
+            url="url",
+        )
+
+
+def test_connection_config_multiple_auth3():
+    with pytest.raises(ValidationError):
+        ConfluenceConnectionConfig(
+            access_config=ConfluenceAccessConfig(
+                api_token="api_token",
+                password="password",
+            ),
+            username="user_email",
+            url="url",
+        )
+
+
 def test_connection_config_no_auth():
     with pytest.raises(ValidationError):
         ConfluenceConnectionConfig(access_config=ConfluenceAccessConfig(), url="url")
 
 
-def test_connection_config_basic_auth():
+def test_connection_config_password_auth():
+    ConfluenceConnectionConfig(
+        access_config=ConfluenceAccessConfig(password="password"),
+        url="url",
+        username="user_email",
+    )
+
+
+def test_connection_config_api_token_auth():
     ConfluenceConnectionConfig(
-        access_config=ConfluenceAccessConfig(password="api_token"),
+        access_config=ConfluenceAccessConfig(api_token="api_token"),
         url="url",
         username="user_email",
     )
unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.5.14"  # pragma: no cover
+__version__ = "0.5.16"  # pragma: no cover
unstructured_ingest/embed/huggingface.py

@@ -1,4 +1,4 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Optional
 
 from pydantic import Field

@@ -15,14 +15,11 @@ if TYPE_CHECKING:
 
 
 class HuggingFaceEmbeddingConfig(EmbeddingConfig):
-    embedder_model_name: Optional[str] = Field(
-        default="sentence-transformers/all-MiniLM-L6-v2", alias="model_name"
-    )
+    embedder_model_name: Optional[str] = Field(default="all-MiniLM-L6-v2", alias="model_name")
     embedder_model_kwargs: Optional[dict] = Field(
         default_factory=lambda: {"device": "cpu"}, alias="model_kwargs"
     )
     encode_kwargs: Optional[dict] = Field(default_factory=lambda: {"normalize_embeddings": False})
-    cache_folder: Optional[str] = Field(default=None)
 
     @requires_dependencies(
         ["sentence_transformers"],

@@ -33,7 +30,6 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):
 
         return SentenceTransformer(
             model_name_or_path=self.embedder_model_name,
-            cache_folder=self.cache_folder,
             **self.embedder_model_kwargs,
         )
 

@@ -45,7 +41,7 @@ class HuggingFaceEmbeddingConfig(EmbeddingConfig):
 
 @dataclass
 class HuggingFaceEmbeddingEncoder(BaseEmbeddingEncoder):
-    config: HuggingFaceEmbeddingConfig
+    config: HuggingFaceEmbeddingConfig = field(default_factory=HuggingFaceEmbeddingConfig)
 
     def _embed_query(self, query: str) -> list[float]:
         return self._embed_documents(texts=[query])[0]
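
For context, a short sketch of what the default_factory change above enables: the encoder can now be constructed without an explicit config, since every field on HuggingFaceEmbeddingConfig has a default. Actually embedding still requires the sentence-transformers extra; the override shown uses the model_name alias.

```python
# Sketch only; requires the sentence-transformers extra to run embeddings.
from unstructured_ingest.embed.huggingface import (
    HuggingFaceEmbeddingConfig,
    HuggingFaceEmbeddingEncoder,
)

# New in this release: the config field has a default_factory, so this works.
encoder = HuggingFaceEmbeddingEncoder()

# Equivalent explicit form, overriding the model via the "model_name" alias.
encoder = HuggingFaceEmbeddingEncoder(
    config=HuggingFaceEmbeddingConfig(model_name="all-MiniLM-L6-v2")
)
```
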
unstructured_ingest/utils/data_prep.py

@@ -2,7 +2,7 @@ import itertools
 import json
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
+from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, Union, cast
 
 import pandas as pd
 

@@ -163,7 +163,9 @@ def write_data(path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
         raise IOError("Unsupported file type: {path}")
 
 
-def get_data(path: Path) -> list[dict]:
+def get_data(path: Union[Path, str]) -> list[dict]:
+    if isinstance(path, str):
+        path = Path(path)
     try:
         return get_data_by_suffix(path=path)
     except Exception as e:
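
For context, get_data now accepts a plain string as well as a Path; the file path below is hypothetical.

```python
# Sketch only: both call forms are equivalent after this change.
from pathlib import Path

from unstructured_ingest.utils.data_prep import get_data

elements = get_data("/tmp/elements.json")        # str is now coerced to Path
elements = get_data(Path("/tmp/elements.json"))  # unchanged behavior
```
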
unstructured_ingest/v2/interfaces/file_data.py

@@ -102,7 +102,7 @@ def file_data_from_file(path: str) -> FileData:
     try:
         return BatchFileData.from_file(path=path)
     except ValidationError:
-        logger.debug(f"{path} not valid for batch file data")
+        logger.debug(f"{path} not detected as batch file data")
 
     return FileData.from_file(path=path)
 
unstructured_ingest/v2/interfaces/upload_stager.py

@@ -1,4 +1,3 @@
-import json
 from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path

@@ -7,6 +6,7 @@ from typing import Any, TypeVar
 from pydantic import BaseModel
 
 from unstructured_ingest.utils import ndjson
+from unstructured_ingest.utils.data_prep import get_data, write_data
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 

@@ -43,16 +43,13 @@ class UploadStager(BaseProcess, ABC):
             writer.f.flush()
 
     def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
-        with input_file.open() as in_f:
-            elements_contents = json.load(in_f)
+        elements_contents = get_data(path=input_file)
 
         conformed_elements = [
             self.conform_dict(element_dict=element, file_data=file_data)
             for element in elements_contents
         ]
-
-        with open(output_file, "w") as out_f:
-            json.dump(conformed_elements, out_f, indent=2)
+        write_data(path=output_file, data=conformed_elements)
 
     def run(
         self,
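
For context, process_whole now routes file I/O through the data_prep helpers, so the staged output format follows the output file's suffix (for example .json vs .ndjson) instead of always being pretty-printed JSON. A hypothetical stager subclass only needs to transform element dicts, as sketched below; the class and the field it adds are assumptions, not part of the package.

```python
# Hypothetical subclass; conform_dict matches the call made by process_whole above.
from unstructured_ingest.v2.interfaces.file_data import FileData
from unstructured_ingest.v2.interfaces.upload_stager import UploadStager


class TagSourceStager(UploadStager):
    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        # Assumed transformation: annotate each element with its source identifier.
        element_dict["source_identifier"] = file_data.identifier
        return element_dict
```
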
unstructured_ingest/v2/pipeline/pipeline.py

@@ -108,6 +108,13 @@ class Pipeline:
         uploader_connector_type = self.uploader_step.process.connector_type
         registry_entry = destination_registry[uploader_connector_type]
         if registry_entry.upload_stager and self.stager_step is None:
+            try:
+                self.stager_step = UploadStageStep(
+                    process=registry_entry.upload_stager(), context=self.context
+                )
+                return
+            except Exception as e:
+                logger.debug(f"failed to instantiate required stager on user's behalf: {e}")
             raise ValueError(
                 f"pipeline with uploader type {self.uploader_step.process.__class__.__name__} "
                 f"expects a stager of type {registry_entry.upload_stager.__name__} "
unstructured_ingest/v2/pipeline/steps/chunk.py

@@ -38,7 +38,7 @@ class ChunkStep(PipelineStep):
         return not filepath.exists()
 
     def get_output_filepath(self, filename: Path) -> Path:
-        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json"
+        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.ndjson"
         filepath = (self.cache_dir / hashed_output_file).resolve()
         filepath.parent.mkdir(parents=True, exist_ok=True)
         return filepath
unstructured_ingest/v2/pipeline/steps/download.py

@@ -88,9 +88,9 @@ class DownloadStep(PipelineStep):
                 f"match size of local file: {file_size_bytes}, updating"
             )
             file_data.metadata.filesize_bytes = file_size_bytes
-            logger.debug(f"updating file data with new content: {file_data.model_dump()}")
+            logger.debug(f"updating file data with new content: {file_data.model_dump_json()}")
             with file_data_path.open("w") as file:
-                json.dump(file_data.model_dump(), file, indent=2)
+                file.write(file_data.model_dump_json(indent=2))
 
     async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
         file_data = file_data_from_file(path=file_data_path)

@@ -173,7 +173,7 @@ class DownloadStep(PipelineStep):
         filepath = (self.cache_dir / filename).resolve()
         filepath.parent.mkdir(parents=True, exist_ok=True)
         with open(str(filepath), "w") as f:
-            json.dump(file_data.model_dump(), f, indent=2)
+            f.write(file_data.model_dump_json(indent=2))
         return str(filepath)
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
unstructured_ingest/v2/pipeline/steps/embed.py

@@ -38,7 +38,7 @@ class EmbedStep(PipelineStep):
         return not filepath.exists()
 
     def get_output_filepath(self, filename: Path) -> Path:
-        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json"
+        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.ndjson"
         filepath = (self.cache_dir / hashed_output_file).resolve()
         filepath.parent.mkdir(parents=True, exist_ok=True)
         return filepath
unstructured_ingest/v2/pipeline/steps/index.py

@@ -37,14 +37,14 @@ class IndexStep(PipelineStep):
     @instrument(span_name=STEP_ID)
     def run(self) -> Generator[str, None, None]:
         for file_data in self.process.run():
-            logger.debug(f"generated file data: {file_data.model_dump()}")
+            logger.debug(f"generated file data: {file_data.model_dump_json()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"
                 filepath = (self.cache_dir / filename).resolve()
                 filepath.parent.mkdir(parents=True, exist_ok=True)
                 with open(str(filepath), "w") as f:
-                    json.dump(file_data.model_dump(), f, indent=2)
+                    f.write(file_data.model_dump_json(indent=2))
                 yield str(filepath)
             except Exception as e:
                 logger.error(f"failed to create index for file data: {file_data}", exc_info=True)

@@ -54,14 +54,14 @@ class IndexStep(PipelineStep):
 
     async def run_async(self) -> AsyncGenerator[str, None]:
         async for file_data in self.process.run_async():
-            logger.debug(f"generated file data: {file_data.model_dump()}")
+            logger.debug(f"generated file data: {file_data.model_dump_json()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"
                 filepath = (self.cache_dir / filename).resolve()
                 filepath.parent.mkdir(parents=True, exist_ok=True)
                 with open(str(filepath), "w") as f:
-                    json.dump(file_data.model_dump(), f, indent=2)
+                    f.write(file_data.model_dump_json(indent=2))
                 yield str(filepath)
             except Exception as e:
                 logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
unstructured_ingest/v2/pipeline/steps/partition.py

@@ -38,7 +38,7 @@ class PartitionStep(PipelineStep):
         return not filepath.exists()
 
     def get_output_filepath(self, filename: Path) -> Path:
-        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json"
+        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.ndjson"
         filepath = (self.cache_dir / hashed_output_file).resolve()
         filepath.parent.mkdir(parents=True, exist_ok=True)
         return filepath
unstructured_ingest/v2/processes/connectors/confluence.py

@@ -35,7 +35,11 @@ CONNECTOR_TYPE = "confluence"
 
 class ConfluenceAccessConfig(AccessConfig):
     password: Optional[str] = Field(
-        description="Confluence password or Cloud API token",
+        description="Confluence password",
+        default=None,
+    )
+    api_token: Optional[str] = Field(
+        description="Confluence Cloud API token",
         default=None,
     )
     token: Optional[str] = Field(

@@ -57,7 +61,12 @@ class ConfluenceConnectionConfig(ConnectionConfig):
 
     def model_post_init(self, __context):
         access_configs = self.access_config.get_secret_value()
-        basic_auth = self.username and access_configs.password
+        if access_configs.password and access_configs.api_token:
+            raise ValueError(
+                "both password and api_token provided, only one allowed, "
+                "see: https://atlassian-python-api.readthedocs.io/"
+            )
+        basic_auth = bool(self.username and (access_configs.password or access_configs.api_token))
         pat_auth = access_configs.token
         if self.cloud and not basic_auth:
             raise ValueError(

@@ -74,6 +83,14 @@ class ConfluenceConnectionConfig(ConnectionConfig):
                 "no form of auth provided, see: https://atlassian-python-api.readthedocs.io/"
             )
 
+    def password_or_api_token(self) -> str:
+        # Confluence takes either password or API token under the same field: password
+        # This ambiguity led to confusion, so we are making it specific what you are passing in
+        access_configs = self.access_config.get_secret_value()
+        if access_configs.password:
+            return access_configs.password
+        return access_configs.api_token
+
     @requires_dependencies(["atlassian"], extras="confluence")
     @contextmanager
     def get_client(self) -> "Confluence":

@@ -83,7 +100,7 @@ class ConfluenceConnectionConfig(ConnectionConfig):
         with Confluence(
             url=self.url,
             username=self.username,
-            password=access_configs.password,
+            password=self.password_or_api_token(),
             token=access_configs.token,
             cloud=self.cloud,
         ) as client:
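
For context, a minimal sketch of the split credentials introduced above: Cloud sites authenticate with username plus api_token, Server/Data Center with username plus password or a personal access token, and supplying both password and api_token is now rejected. The URL and account values below are placeholders.

```python
# Sketch only; placeholder URL, username, and secrets.
from unstructured_ingest.v2.processes.connectors.confluence import (
    ConfluenceAccessConfig,
    ConfluenceConnectionConfig,
)

# Confluence Cloud: username + API token.
cloud_config = ConfluenceConnectionConfig(
    url="https://example.atlassian.net/wiki",
    username="user@example.com",
    access_config=ConfluenceAccessConfig(api_token="..."),
)

# A connection config whose access_config sets both password and api_token
# raises ValueError during model_post_init.
```
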
unstructured_ingest/v2/processes/connectors/databricks/volumes_aws.py

@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
     DatabricksVolumesUploader,
     DatabricksVolumesUploaderConfig,
 )
+from unstructured_ingest.v2.processes.utils.blob_storage import (
+    BlobStoreUploadStager,
+    BlobStoreUploadStagerConfig,
+)
 
 CONNECTOR_TYPE = "databricks_volumes_aws"
 

@@ -76,6 +80,8 @@ databricks_aws_volumes_destination_entry = DestinationRegistryEntry(
     connection_config=DatabricksAWSVolumesConnectionConfig,
     uploader=DatabricksAWSVolumesUploader,
     uploader_config=DatabricksAWSVolumesUploaderConfig,
+    upload_stager_config=BlobStoreUploadStagerConfig,
+    upload_stager=BlobStoreUploadStager,
 )
 
 databricks_aws_volumes_source_entry = SourceRegistryEntry(
unstructured_ingest/v2/processes/connectors/databricks/volumes_azure.py

@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
     DatabricksVolumesUploader,
     DatabricksVolumesUploaderConfig,
 )
+from unstructured_ingest.v2.processes.utils.blob_storage import (
+    BlobStoreUploadStager,
+    BlobStoreUploadStagerConfig,
+)
 
 CONNECTOR_TYPE = "databricks_volumes_azure"
 

@@ -91,6 +95,8 @@ databricks_azure_volumes_destination_entry = DestinationRegistryEntry(
     connection_config=DatabricksAzureVolumesConnectionConfig,
     uploader=DatabricksAzureVolumesUploader,
     uploader_config=DatabricksAzureVolumesUploaderConfig,
+    upload_stager_config=BlobStoreUploadStagerConfig,
+    upload_stager=BlobStoreUploadStager,
 )
 
 databricks_azure_volumes_source_entry = SourceRegistryEntry(
unstructured_ingest/v2/processes/connectors/databricks/volumes_gcp.py

@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
     DatabricksVolumesUploader,
     DatabricksVolumesUploaderConfig,
 )
+from unstructured_ingest.v2.processes.utils.blob_storage import (
+    BlobStoreUploadStager,
+    BlobStoreUploadStagerConfig,
+)
 
 CONNECTOR_TYPE = "databricks_volumes_gcp"
 

@@ -74,6 +78,8 @@ databricks_gcp_volumes_destination_entry = DestinationRegistryEntry(
     connection_config=DatabricksGoogleVolumesConnectionConfig,
     uploader=DatabricksGoogleVolumesUploader,
     uploader_config=DatabricksGoogleVolumesUploaderConfig,
+    upload_stager_config=BlobStoreUploadStagerConfig,
+    upload_stager=BlobStoreUploadStager,
 )
 
 databricks_gcp_volumes_source_entry = SourceRegistryEntry(
unstructured_ingest/v2/processes/connectors/databricks/volumes_native.py

@@ -17,6 +17,10 @@ from unstructured_ingest.v2.processes.connectors.databricks.volumes import (
     DatabricksVolumesUploader,
     DatabricksVolumesUploaderConfig,
 )
+from unstructured_ingest.v2.processes.utils.blob_storage import (
+    BlobStoreUploadStager,
+    BlobStoreUploadStagerConfig,
+)
 
 CONNECTOR_TYPE = "databricks_volumes"
 

@@ -75,6 +79,8 @@ databricks_native_volumes_destination_entry = DestinationRegistryEntry(
     connection_config=DatabricksNativeVolumesConnectionConfig,
     uploader=DatabricksNativeVolumesUploader,
     uploader_config=DatabricksNativeVolumesUploaderConfig,
+    upload_stager_config=BlobStoreUploadStagerConfig,
+    upload_stager=BlobStoreUploadStager,
 )
 
 databricks_native_volumes_source_entry = SourceRegistryEntry(
unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py

@@ -61,7 +61,7 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
                     self.upload_config.database, ", ".join(databases)
                 )
             )
-        cursor.execute("SHOW TABLES")
+        cursor.execute(f"SHOW TABLES IN {self.upload_config.database}")
         table_names = [r[1] for r in cursor.fetchall()]
         if self.upload_config.table_name not in table_names:
             raise ValueError(
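
For context, a bare SHOW TABLES lists tables in whatever schema the session happens to be using, so the configured table could be reported missing; scoping the query to the configured database makes the existence check deterministic. A small sketch of the resulting pattern (the helper name is hypothetical; the row layout is the standard database/tableName/isTemporary result of SHOW TABLES):

```python
# Hypothetical helper mirroring the precheck above; works with a DB-API cursor
# connected to Databricks SQL.
def table_exists(cursor, database: str, table_name: str) -> bool:
    cursor.execute(f"SHOW TABLES IN {database}")
    # Each row is (database, tableName, isTemporary); column 1 is the table name.
    return table_name in {row[1] for row in cursor.fetchall()}
```
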
unstructured_ingest/v2/processes/connectors/fsspec/azure.py

@@ -26,6 +26,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
     FsspecUploaderConfig,
 )
 from unstructured_ingest.v2.processes.connectors.fsspec.utils import json_serial, sterilize_dict
+from unstructured_ingest.v2.processes.utils.blob_storage import (
+    BlobStoreUploadStager,
+    BlobStoreUploadStagerConfig,
+)
 
 if TYPE_CHECKING:
     from adlfs import AzureBlobFileSystem

@@ -194,4 +198,6 @@ azure_destination_entry = DestinationRegistryEntry(
     uploader=AzureUploader,
     uploader_config=AzureUploaderConfig,
     connection_config=AzureConnectionConfig,
+    upload_stager_config=BlobStoreUploadStagerConfig,
+    upload_stager=BlobStoreUploadStager,
 )
unstructured_ingest/v2/processes/connectors/fsspec/box.py

@@ -28,6 +28,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
     FsspecUploaderConfig,
 )
 from unstructured_ingest.v2.processes.connectors.utils import conform_string_to_dict
+from unstructured_ingest.v2.processes.utils.blob_storage import (
+    BlobStoreUploadStager,
+    BlobStoreUploadStagerConfig,
+)
 
 if TYPE_CHECKING:
     from boxfs import BoxFileSystem

@@ -167,4 +171,6 @@ box_destination_entry = DestinationRegistryEntry(
     uploader=BoxUploader,
     uploader_config=BoxUploaderConfig,
     connection_config=BoxConnectionConfig,
+    upload_stager_config=BlobStoreUploadStagerConfig,
+    upload_stager=BlobStoreUploadStager,
 )
unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py

@@ -31,6 +31,10 @@ from unstructured_ingest.v2.processes.connectors.fsspec.fsspec import (
     FsspecUploader,
     FsspecUploaderConfig,
 )
+from unstructured_ingest.v2.processes.utils.blob_storage import (
+    BlobStoreUploadStager,
+    BlobStoreUploadStagerConfig,
+)
 
 if TYPE_CHECKING:
     pass

@@ -228,4 +232,6 @@ dropbox_destination_entry = DestinationRegistryEntry(
     uploader=DropboxUploader,
     uploader_config=DropboxUploaderConfig,
     connection_config=DropboxConnectionConfig,
+    upload_stager_config=BlobStoreUploadStagerConfig,
+    upload_stager=BlobStoreUploadStager,
 )