unstructured-ingest 0.5.15__py3-none-any.whl → 0.5.16__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -1,6 +1,5 @@
1
1
  import os
2
2
  from pathlib import Path
3
- from typing import Optional
4
3
 
5
4
  import pytest
6
5
 
@@ -21,20 +20,20 @@ from unstructured_ingest.v2.processes.connectors.zendesk.zendesk import (
21
20
  ZendeskIndexerConfig,
22
21
  )
23
22
 
23
+ SUBDOMAIN = "unstructuredhelp"
24
+ EMAIL = "test@unstructured.io"
24
25
 
25
- async def zendesk_source_test(
26
- tmp_path: Path,
27
- token: Optional[str] = None,
28
- email: Optional[str] = None,
29
- subdomain: Optional[str] = None,
30
- ):
31
26
 
32
- access_config = ZendeskAccessConfig(api_token=token)
27
+ @pytest.mark.asyncio
28
+ @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
29
+ @requires_env("ZENDESK_TOKEN")
30
+ async def test_zendesk_source_tickets(temp_dir: Path):
31
+ access_config = ZendeskAccessConfig(api_token=os.environ["ZENDESK_TOKEN"])
33
32
  connection_config = ZendeskConnectionConfig(
34
- subdomain=subdomain, email=email, access_config=access_config
33
+ subdomain=SUBDOMAIN, email=EMAIL, access_config=access_config
35
34
  )
36
35
 
37
- index_config = ZendeskIndexerConfig(batch_size=2, item_type="tickets")
36
+ index_config = ZendeskIndexerConfig(item_type="tickets")
38
37
 
39
38
  indexer = ZendeskIndexer(
40
39
  connection_config=connection_config,
@@ -43,7 +42,7 @@ async def zendesk_source_test(
43
42
  )
44
43
 
45
44
  # handle downloader.
46
- download_config = ZendeskDownloaderConfig(download_dir=tmp_path)
45
+ download_config = ZendeskDownloaderConfig(download_dir=temp_dir)
47
46
 
48
47
  downloader = ZendeskDownloader(
49
48
  connection_config=connection_config,
@@ -57,26 +56,23 @@ async def zendesk_source_test(
57
56
  downloader=downloader,
58
57
  configs=SourceValidationConfigs(
59
58
  test_id="zendesk-tickets",
60
- expected_num_files=4,
59
+ expected_num_files=8,
61
60
  validate_file_data=False,
62
61
  validate_downloaded_files=True,
63
62
  ),
64
63
  )
65
64
 
66
65
 
67
- async def zendesk_source_articles_test(
68
- tmp_path: Path,
69
- token: Optional[str] = None,
70
- email: Optional[str] = None,
71
- subdomain: Optional[str] = None,
72
- ):
73
-
74
- access_config = ZendeskAccessConfig(api_token=token)
66
+ @pytest.mark.asyncio
67
+ @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
68
+ @requires_env("ZENDESK_TOKEN")
69
+ async def test_zendesk_source_articles(temp_dir):
70
+ access_config = ZendeskAccessConfig(api_token=os.environ["ZENDESK_TOKEN"])
75
71
  connection_config = ZendeskConnectionConfig(
76
- subdomain=subdomain, email=email, access_config=access_config
72
+ subdomain=SUBDOMAIN, email=EMAIL, access_config=access_config
77
73
  )
78
74
 
79
- index_config = ZendeskIndexerConfig(batch_size=2, item_type="articles")
75
+ index_config = ZendeskIndexerConfig(item_type="articles")
80
76
 
81
77
  indexer = ZendeskIndexer(
82
78
  connection_config=connection_config,
@@ -85,7 +81,7 @@ async def zendesk_source_articles_test(
85
81
  )
86
82
 
87
83
  # handle downloader.
88
- download_config = ZendeskDownloaderConfig(download_dir=tmp_path, extract_images=True)
84
+ download_config = ZendeskDownloaderConfig(download_dir=temp_dir, extract_images=True)
89
85
 
90
86
  downloader = ZendeskDownloader(
91
87
  connection_config=connection_config,
@@ -99,44 +95,26 @@ async def zendesk_source_articles_test(
99
95
  downloader=downloader,
100
96
  configs=SourceValidationConfigs(
101
97
  test_id="zendesk-articles",
102
- expected_num_files=4,
103
- validate_file_data=False,
98
+ expected_num_files=8,
99
+ validate_file_data=True,
104
100
  validate_downloaded_files=True,
105
101
  ),
106
102
  )
107
103
 
108
104
 
109
- @pytest.mark.asyncio
110
105
  @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
111
- @requires_env("ZENDESK_TOKEN")
112
- async def test_zendesk_source(temp_dir):
113
- await zendesk_source_test(
114
- tmp_path=temp_dir,
115
- token=os.environ["ZENDESK_TOKEN"],
116
- email="test@unstructured.io",
117
- subdomain="unstructuredhelp",
106
+ def test_zendesk_source_articles_fail(temp_dir):
107
+ access_config = ZendeskAccessConfig(api_token="FAKE_TOKEN")
108
+ connection_config = ZendeskConnectionConfig(
109
+ subdomain=SUBDOMAIN, email=EMAIL, access_config=access_config
118
110
  )
119
111
 
112
+ index_config = ZendeskIndexerConfig(item_type="tickets")
120
113
 
121
- @pytest.mark.asyncio
122
- @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
123
- @requires_env("ZENDESK_TOKEN")
124
- async def test_zendesk_source_articles(temp_dir):
125
- await zendesk_source_articles_test(
126
- tmp_path=temp_dir,
127
- token=os.environ["ZENDESK_TOKEN"],
128
- email="test@unstructured.io",
129
- subdomain="unstructuredhelp",
114
+ indexer = ZendeskIndexer(
115
+ connection_config=connection_config,
116
+ index_config=index_config,
117
+ connector_type=CONNECTOR_TYPE,
130
118
  )
131
-
132
-
133
- @pytest.mark.asyncio
134
- @pytest.mark.tags(SOURCE_TAG, CONNECTOR_TYPE, UNCATEGORIZED_TAG)
135
- async def test_zendesk_source_articles_fail(temp_dir):
136
119
  with pytest.raises(expected_exception=UserAuthError):
137
- await zendesk_source_articles_test(
138
- tmp_path=temp_dir,
139
- token="FORCE_FAIL_TOKEN",
140
- email="test@unstructured.io",
141
- subdomain="unstructuredhelp",
142
- )
120
+ indexer.precheck()
@@ -103,7 +103,7 @@ def check_contents(
103
103
  file_data_path = expected_output_dir / f"{file_data.identifier}.json"
104
104
  with file_data_path.open("r") as file:
105
105
  expected_file_data_contents = json.load(file)
106
- current_file_data_contents = file_data.model_dump()
106
+ current_file_data_contents = json.loads(file_data.model_dump_json())
107
107
  expected_file_data_contents = configs.omit_ignored_fields(expected_file_data_contents)
108
108
  current_file_data_contents = configs.omit_ignored_fields(current_file_data_contents)
109
109
  diff = DeepDiff(expected_file_data_contents, current_file_data_contents)
@@ -184,7 +184,7 @@ def update_fixtures(
184
184
  for file_data in all_file_data:
185
185
  file_data_path = file_data_output_path / f"{file_data.identifier}.json"
186
186
  with file_data_path.open(mode="w") as f:
187
- json.dump(file_data.model_dump(), f, indent=2)
187
+ f.write(file_data.model_dump_json(indent=2))
188
188
 
189
189
  # Record file structure of download directory
190
190
  download_files = get_files(dir_path=download_dir)
@@ -216,7 +216,9 @@ def run_all_validations(
216
216
  len(predownload_file_data) == expected_number_indexed_file_data
217
217
  ), f"expected {expected_number_indexed_file_data} but got {len(predownload_file_data)}"
218
218
  if expected_num_files := configs.expected_num_files:
219
- assert len(postdownload_file_data) == expected_num_files
219
+ assert (
220
+ len(postdownload_file_data) == expected_num_files
221
+ ), f"expected {expected_num_files} but got {len(postdownload_file_data)}"
220
222
 
221
223
  for pre_data, post_data in zip(predownload_file_data, postdownload_file_data):
222
224
  configs.run_file_data_validation(
@@ -1 +1 @@
1
- __version__ = "0.5.15" # pragma: no cover
1
+ __version__ = "0.5.16" # pragma: no cover
@@ -88,9 +88,9 @@ class DownloadStep(PipelineStep):
88
88
  f"match size of local file: {file_size_bytes}, updating"
89
89
  )
90
90
  file_data.metadata.filesize_bytes = file_size_bytes
91
- logger.debug(f"updating file data with new content: {file_data.model_dump()}")
91
+ logger.debug(f"updating file data with new content: {file_data.model_dump_json()}")
92
92
  with file_data_path.open("w") as file:
93
- json.dump(file_data.model_dump(), file, indent=2)
93
+ file.write(file_data.model_dump_json(indent=2))
94
94
 
95
95
  async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
96
96
  file_data = file_data_from_file(path=file_data_path)
@@ -173,7 +173,7 @@ class DownloadStep(PipelineStep):
173
173
  filepath = (self.cache_dir / filename).resolve()
174
174
  filepath.parent.mkdir(parents=True, exist_ok=True)
175
175
  with open(str(filepath), "w") as f:
176
- json.dump(file_data.model_dump(), f, indent=2)
176
+ f.write(file_data.model_dump_json(indent=2))
177
177
  return str(filepath)
178
178
 
179
179
  def get_hash(self, extras: Optional[list[str]]) -> str:
@@ -37,14 +37,14 @@ class IndexStep(PipelineStep):
37
37
  @instrument(span_name=STEP_ID)
38
38
  def run(self) -> Generator[str, None, None]:
39
39
  for file_data in self.process.run():
40
- logger.debug(f"generated file data: {file_data.model_dump()}")
40
+ logger.debug(f"generated file data: {file_data.model_dump_json()}")
41
41
  try:
42
42
  record_hash = self.get_hash(extras=[file_data.identifier])
43
43
  filename = f"{record_hash}.json"
44
44
  filepath = (self.cache_dir / filename).resolve()
45
45
  filepath.parent.mkdir(parents=True, exist_ok=True)
46
46
  with open(str(filepath), "w") as f:
47
- json.dump(file_data.model_dump(), f, indent=2)
47
+ f.write(file_data.model_dump_json(indent=2))
48
48
  yield str(filepath)
49
49
  except Exception as e:
50
50
  logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
@@ -54,14 +54,14 @@ class IndexStep(PipelineStep):
54
54
 
55
55
  async def run_async(self) -> AsyncGenerator[str, None]:
56
56
  async for file_data in self.process.run_async():
57
- logger.debug(f"generated file data: {file_data.model_dump()}")
57
+ logger.debug(f"generated file data: {file_data.model_dump_json()}")
58
58
  try:
59
59
  record_hash = self.get_hash(extras=[file_data.identifier])
60
60
  filename = f"{record_hash}.json"
61
61
  filepath = (self.cache_dir / filename).resolve()
62
62
  filepath.parent.mkdir(parents=True, exist_ok=True)
63
63
  with open(str(filepath), "w") as f:
64
- json.dump(file_data.model_dump(), f, indent=2)
64
+ f.write(file_data.model_dump_json(indent=2))
65
65
  yield str(filepath)
66
66
  except Exception as e:
67
67
  logger.error(f"failed to create index for file data: {file_data}", exc_info=True)
@@ -61,7 +61,7 @@ class DatabricksVolumeDeltaTableUploader(Uploader):
61
61
  self.upload_config.database, ", ".join(databases)
62
62
  )
63
63
  )
64
- cursor.execute("SHOW TABLES")
64
+ cursor.execute(f"SHOW TABLES IN {self.upload_config.database}")
65
65
  table_names = [r[1] for r in cursor.fetchall()]
66
66
  if self.upload_config.table_name not in table_names:
67
67
  raise ValueError(