unstructured-ingest 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in that registry.


Files changed (30)
  1. test/integration/connectors/utils/validation/equality.py +2 -1
  2. test/unit/v2/connectors/sql/test_sql.py +4 -2
  3. unstructured_ingest/__version__.py +1 -1
  4. unstructured_ingest/utils/data_prep.py +11 -3
  5. unstructured_ingest/utils/html.py +109 -0
  6. unstructured_ingest/utils/ndjson.py +52 -0
  7. unstructured_ingest/v2/interfaces/upload_stager.py +3 -13
  8. unstructured_ingest/v2/pipeline/steps/chunk.py +3 -4
  9. unstructured_ingest/v2/pipeline/steps/embed.py +3 -4
  10. unstructured_ingest/v2/pipeline/steps/partition.py +3 -4
  11. unstructured_ingest/v2/processes/connectors/confluence.py +95 -25
  12. unstructured_ingest/v2/processes/connectors/duckdb/base.py +2 -2
  13. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +8 -8
  14. unstructured_ingest/v2/processes/connectors/fsspec/box.py +7 -7
  15. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +9 -9
  16. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +41 -9
  17. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +7 -7
  18. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +8 -8
  19. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +5 -5
  20. unstructured_ingest/v2/processes/connectors/sql/__init__.py +4 -0
  21. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +2 -1
  22. unstructured_ingest/v2/processes/connectors/sql/sql.py +12 -8
  23. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +2 -1
  24. unstructured_ingest/v2/processes/connectors/sql/vastdb.py +270 -0
  25. {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.1.dist-info}/METADATA +25 -22
  26. {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.1.dist-info}/RECORD +30 -27
  27. {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.1.dist-info}/LICENSE.md +0 -0
  28. {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.1.dist-info}/WHEEL +0 -0
  29. {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.1.dist-info}/entry_points.txt +0 -0
  30. {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.1.dist-info}/top_level.txt +0 -0

test/integration/connectors/utils/validation/equality.py
@@ -1,10 +1,11 @@
 import json
 from pathlib import Path
 
-import ndjson
 from bs4 import BeautifulSoup
 from deepdiff import DeepDiff
 
+from unstructured_ingest.utils import ndjson
+
 
 def json_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
     with expected_filepath.open() as f:

test/unit/v2/connectors/sql/test_sql.py
@@ -47,7 +47,9 @@ def test_run_output_filename_suffix(
     mock_get_output_path = mocker.patch.object(
         SQLUploadStager, "get_output_path", return_value=output_dir / expected
     )
-    mock_write_output = mocker.patch.object(SQLUploadStager, "write_output")
+    mock_write_output = mocker.patch(
+        "unstructured_ingest.v2.processes.connectors.sql.sql.write_data", return_value=None
+    )
 
     # Act
     result = mock_instance.run(
@@ -67,6 +69,6 @@ def test_run_output_filename_suffix(
     mock_conform_dataframe.assert_called_once()
     mock_get_output_path.assert_called_once_with(output_filename=expected, output_dir=output_dir)
     mock_write_output.assert_called_once_with(
-        output_path=output_dir / expected, data=[{"key": "value"}, {"key": "value2"}]
+        path=output_dir / expected, data=[{"key": "value"}, {"key": "value2"}]
     )
     assert result.name == expected

unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.4.0" # pragma: no cover
+__version__ = "0.4.1" # pragma: no cover

unstructured_ingest/utils/data_prep.py
@@ -4,9 +4,9 @@ from datetime import datetime
 from pathlib import Path
 from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
 
-import ndjson
 import pandas as pd
 
+from unstructured_ingest.utils import ndjson
 from unstructured_ingest.v2.logger import logger
 
 DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
@@ -153,6 +153,16 @@ def get_data_by_suffix(path: Path) -> list[dict]:
     raise ValueError(f"Unsupported file type: {path}")
 
 
+def write_data(path: Path, data: list[dict], indent: int = 2) -> None:
+    with path.open("w") as f:
+        if path.suffix == ".json":
+            json.dump(data, f, indent=indent, ensure_ascii=False)
+        elif path.suffix == ".ndjson":
+            ndjson.dump(data, f, ensure_ascii=False)
+        else:
+            raise IOError("Unsupported file type: {path}")
+
+
 def get_data(path: Path) -> list[dict]:
     try:
         return get_data_by_suffix(path=path)
@@ -179,8 +189,6 @@ def get_data(path: Path) -> list[dict]:
     except Exception as e:
         logger.warning(f"failed to read {path} as parquet: {e}")
 
-    raise IOError(f"File could not be parsed: {path}")
-
 
 def get_data_df(path: Path) -> pd.DataFrame:
     with path.open() as f:
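
The new write_data helper centralizes element serialization: it picks JSON or NDJSON from the output path's suffix, which is what the pipeline steps and stagers further down switch to. A minimal sketch of pairing it with the existing get_data reader; the element dicts and the out/ directory are made up for illustration:

from pathlib import Path

from unstructured_ingest.utils.data_prep import get_data, write_data

# Hypothetical element payload; real pipelines pass partitioned/chunked/embedded element dicts.
elements = [{"type": "Title", "text": "hello"}, {"type": "NarrativeText", "text": "world"}]

out_dir = Path("out")
out_dir.mkdir(parents=True, exist_ok=True)

write_data(path=out_dir / "elements.json", data=elements)    # indented JSON array
write_data(path=out_dir / "elements.ndjson", data=elements)  # one JSON object per line

# get_data dispatches on suffix as well, so either file reads back as a list[dict]
print(get_data(path=out_dir / "elements.ndjson"))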

unstructured_ingest/utils/html.py
@@ -0,0 +1,109 @@
+import base64
+from pathlib import Path
+from typing import Optional
+from urllib.parse import urlparse
+from uuid import NAMESPACE_DNS, uuid5
+
+import requests
+from bs4 import BeautifulSoup
+from requests import Session
+
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, SourceIdentifiers
+from unstructured_ingest.v2.logger import logger
+
+
+def convert_image_tags(url: str, original_html: str, session: Optional[Session] = None) -> str:
+    session = session or requests.Session()
+    parsed_url = urlparse(url)
+    base_url = parsed_url.scheme + "://" + parsed_url.netloc
+    soup = BeautifulSoup(original_html, "html.parser")
+    images = soup.find_all("img")
+    for image in images:
+        current_source = image["src"]
+        if current_source.startswith("//"):
+            source_url = f"{parsed_url.scheme}:{current_source}"
+        elif current_source.startswith("http"):
+            source_url = current_source
+        else:
+            source_url = base_url + current_source
+        try:
+            response = session.get(source_url)
+            response.raise_for_status()
+            image_content = response.content
+            logger.debug(
+                "img tag having src updated from {} to base64 content".format(image["src"])
+            )
+            image["src"] = f"data:image/png;base64,{base64.b64encode(image_content).decode()}"
+        except Exception as e:
+            logger.warning(
+                f"failed to download image content from {source_url}: {e}", exc_info=True
+            )
+    return str(soup)
+
+
+def download_link(
+    download_dir: Path, link: str, session: Optional[Session] = None, force_download: bool = False
+) -> Path:
+    session = session or requests.Session()
+    filename = Path(urlparse(url=link).path).name
+    download_path = download_dir / filename
+    logger.debug(f"downloading file from {link} to {download_path}")
+    if download_path.exists() and download_path.is_file() and not force_download:
+        return download_path
+    with download_path.open("wb") as downloaded_file:
+        response = session.get(link)
+        response.raise_for_status()
+        downloaded_file.write(response.content)
+    return download_path
+
+
+def download_embedded_files(
+    download_dir: Path,
+    original_filedata: FileData,
+    original_html: str,
+    session: Optional[Session] = None,
+    force_download: bool = False,
+) -> list[DownloadResponse]:
+    session = session or requests.Session()
+    url = original_filedata.metadata.url
+    parsed_url = urlparse(url)
+    base_url = parsed_url.scheme + "://" + parsed_url.netloc
+    soup = BeautifulSoup(original_html, "html.parser")
+    tags = soup.find_all("a", href=True)
+    hrefs = [
+        tag["href"]
+        for tag in tags
+        if not tag["href"].startswith("#") and Path(tag["href"]).suffix != ""
+    ]
+    results = []
+    for current_source in hrefs:
+        download_dir.mkdir(parents=True, exist_ok=True)
+        if current_source.startswith("//"):
+            source_url = f"{parsed_url.scheme}:{current_source}"
+        elif current_source.startswith("http"):
+            source_url = current_source
+        else:
+            source_url = base_url + current_source
+        try:
+            downloaded_path = download_link(
+                download_dir=download_dir,
+                link=source_url,
+                session=session,
+                force_download=force_download,
+            )
+        except Exception as e:
+            logger.warning(f"failed to download file content from {source_url}: {e}")
+            continue
+        result_file_data = original_filedata.model_copy(deep=True)
+        result_file_data.metadata.url = source_url
+        result_file_data.metadata.record_locator["parent_url"] = url
+        result_file_data.identifier = str(
+            uuid5(NAMESPACE_DNS, source_url + original_filedata.identifier)
+        )
+        filename = Path(urlparse(url=source_url).path).name
+        result_file_data.source_identifiers = SourceIdentifiers(
+            filename=filename, fullpath=filename
+        )
+        result_file_data.local_download_path = downloaded_path.as_posix()
+        results.append(DownloadResponse(file_data=result_file_data, path=downloaded_path))
+    return results
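
These helpers back the new Confluence download options: convert_image_tags inlines <img> sources as base64 data URIs, download_link fetches a single linked file, and download_embedded_files turns page links into extra DownloadResponse entries. A rough sketch of the two simpler calls; the URL and HTML below are placeholders, so the network fetches will fail (and be logged or raised) if run verbatim:

from pathlib import Path

import requests

from unstructured_ingest.utils.html import convert_image_tags, download_link

session = requests.Session()
page_url = "https://wiki.example.com/pages/123"  # hypothetical page URL
html = '<body><img src="/images/diagram.png"><a href="/files/spec.pdf">spec</a></body>'

# Relative img src values are resolved against the page URL, fetched, and rewritten to
# data:image/png;base64,... URIs; failed fetches are logged and the src is left untouched.
inlined = convert_image_tags(url=page_url, original_html=html, session=session)

# download_link saves one linked file into the directory, reusing an existing copy unless forced.
target_dir = Path("downloads")
target_dir.mkdir(parents=True, exist_ok=True)
pdf_path = download_link(
    download_dir=target_dir, link="https://wiki.example.com/files/spec.pdf", session=session
)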

unstructured_ingest/utils/ndjson.py
@@ -0,0 +1,52 @@
+import json
+from typing import IO, Any
+
+
+def dumps(obj: list[dict[str, Any]], **kwargs) -> str:
+    return "\n".join(json.dumps(each, **kwargs) for each in obj)
+
+
+def dump(obj: list[dict[str, Any]], fp: IO, **kwargs) -> None:
+    # Indent breaks ndjson formatting
+    kwargs["indent"] = None
+    text = dumps(obj, **kwargs)
+    fp.write(text)
+
+
+def loads(s: str, **kwargs) -> list[dict[str, Any]]:
+    return [json.loads(line, **kwargs) for line in s.splitlines()]
+
+
+def load(fp: IO, **kwargs) -> list[dict[str, Any]]:
+    return loads(fp.read(), **kwargs)
+
+
+class writer(object):
+    def __init__(self, f, **kwargs):
+        self.f = f
+        self.kwargs = kwargs
+
+    def write(self, row):
+        stringified = json.dumps(row, **self.kwargs)
+        self.f.write(stringified + "\n")
+
+
+class reader(object):
+    def __init__(self, f, **kwargs):
+        self.f = f
+        self.kwargs = kwargs
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        line = ""
+
+        while line == "":
+            line = next(self.f).strip()
+
+        return json.loads(line, **self.kwargs)
+
+    # NOTE: this is necessary to comply with py27
+    def next(self):
+        return self.__next__()
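
This vendored ndjson module drops the external ndjson dependency in favor of thin wrappers around the standard json module; note that its writer exposes write() rather than the old writerow(), which is why the upload_stager.py hunk below changes that call. A quick sketch of the round trip:

import io

from unstructured_ingest.utils import ndjson

rows = [{"a": 1}, {"b": 2}]

text = ndjson.dumps(rows)          # '{"a": 1}\n{"b": 2}'
assert ndjson.loads(text) == rows

buf = io.StringIO()
writer = ndjson.writer(buf)
for row in rows:
    writer.write(row)              # write(), not the old writerow()
buf.seek(0)
assert list(ndjson.reader(buf)) == rows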

unstructured_ingest/v2/interfaces/upload_stager.py
@@ -2,11 +2,11 @@ import json
 from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Optional, TypeVar
+from typing import Any, TypeVar
 
-import ndjson
 from pydantic import BaseModel
 
+from unstructured_ingest.utils import ndjson
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 
@@ -22,16 +22,6 @@ UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
 class UploadStager(BaseProcess, ABC):
     upload_stager_config: UploadStagerConfigT
 
-    def write_output(self, output_path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
-        if output_path.suffix == ".json":
-            with output_path.open("w") as f:
-                json.dump(data, f, indent=indent)
-        elif output_path.suffix == ".ndjson":
-            with output_path.open("w") as f:
-                ndjson.dump(data, f)
-        else:
-            raise ValueError(f"Unsupported output format: {output_path}")
-
     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         return element_dict
 
@@ -49,7 +39,7 @@ class UploadStager(BaseProcess, ABC):
             writer = ndjson.writer(out_f)
             for element in reader:
                 conformed_element = self.conform_dict(element_dict=element, file_data=file_data)
-                writer.writerow(row=conformed_element)
+                writer.write(row=conformed_element)
             writer.f.flush()
 
     def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:

unstructured_ingest/v2/pipeline/steps/chunk.py
@@ -1,10 +1,10 @@
 import asyncio
 import hashlib
-import json
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
+from unstructured_ingest.utils.data_prep import write_data
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
@@ -44,9 +44,8 @@ class ChunkStep(PipelineStep):
         return filepath
 
     def _save_output(self, output_filepath: str, chunked_content: list[dict]):
-        with open(str(output_filepath), "w") as f:
-            logger.debug(f"writing chunker output to: {output_filepath}")
-            json.dump(chunked_content, f, indent=2)
+        logger.debug(f"writing chunker output to: {output_filepath}")
+        write_data(path=Path(output_filepath), data=chunked_content)
 
     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str, **kwargs

unstructured_ingest/v2/pipeline/steps/embed.py
@@ -1,10 +1,10 @@
 import asyncio
 import hashlib
-import json
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
+from unstructured_ingest.utils.data_prep import write_data
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
@@ -44,9 +44,8 @@ class EmbedStep(PipelineStep):
         return filepath
 
     def _save_output(self, output_filepath: str, embedded_content: list[dict]):
-        with open(str(output_filepath), "w") as f:
-            logger.debug(f"writing embedded output to: {output_filepath}")
-            json.dump(embedded_content, f, indent=2)
+        logger.debug(f"writing embedded output to: {output_filepath}")
+        write_data(path=Path(output_filepath), data=embedded_content)
 
     async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
         path = Path(path)

unstructured_ingest/v2/pipeline/steps/partition.py
@@ -1,10 +1,10 @@
 import asyncio
 import hashlib
-import json
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
+from unstructured_ingest.utils.data_prep import write_data
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
@@ -44,9 +44,8 @@ class PartitionStep(PipelineStep):
         return filepath
 
     def _save_output(self, output_filepath: str, partitioned_content: list[dict]):
-        with open(str(output_filepath), "w") as f:
-            logger.debug(f"writing partitioned output to: {output_filepath}")
-            json.dump(partitioned_content, f, indent=2)
+        logger.debug(f"writing partitioned output to: {output_filepath}")
+        write_data(path=Path(output_filepath), data=partitioned_content)
 
     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str

unstructured_ingest/v2/processes/connectors/confluence.py
@@ -1,3 +1,4 @@
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Generator, List, Optional
@@ -17,6 +18,7 @@ from unstructured_ingest.v2.interfaces import (
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
+    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -71,17 +73,19 @@ class ConfluenceConnectionConfig(ConnectionConfig):
     )
 
     @requires_dependencies(["atlassian"], extras="confluence")
+    @contextmanager
     def get_client(self) -> "Confluence":
        from atlassian import Confluence
 
        access_configs = self.access_config.get_secret_value()
-        return Confluence(
+        with Confluence(
            url=self.url,
            username=self.username,
            password=access_configs.password,
            token=access_configs.token,
            cloud=self.cloud,
-        )
+        ) as client:
+            yield client
 
 
 class ConfluenceIndexerConfig(IndexerConfig):
@@ -103,8 +107,8 @@ class ConfluenceIndexer(Indexer):
 
             # Attempt to retrieve a list of spaces with limit=1.
             # This should only succeed if all creds are valid
-            client = self.connection_config.get_client()
-            client.get_all_spaces(limit=1)
+            with self.connection_config.get_client() as client:
+                client.get_all_spaces(limit=1)
             logger.info("Connection to Confluence successful.")
             return True
         except Exception as e:
@@ -116,21 +120,21 @@ class ConfluenceIndexer(Indexer):
         if spaces:
             return spaces
         else:
-            client = self.connection_config.get_client()
-            all_spaces = client.get_all_spaces(limit=self.index_config.max_num_of_spaces)
+            with self.connection_config.get_client() as client:
+                all_spaces = client.get_all_spaces(limit=self.index_config.max_num_of_spaces)
             space_ids = [space["key"] for space in all_spaces["results"]]
             return space_ids
 
     def _get_docs_ids_within_one_space(self, space_id: str) -> List[dict]:
-        client = self.connection_config.get_client()
-        pages = client.get_all_pages_from_space(
-            space=space_id,
-            start=0,
-            limit=self.index_config.max_num_of_docs_from_each_space,
-            expand=None,
-            content_type="page",
-            status=None,
-        )
+        with self.connection_config.get_client() as client:
+            pages = client.get_all_pages_from_space(
+                space=space_id,
+                start=0,
+                limit=self.index_config.max_num_of_docs_from_each_space,
+                expand=None,
+                content_type="page",
+                status=None,
+            )
         doc_ids = [{"space_id": space_id, "doc_id": page["id"]} for page in pages]
         return doc_ids
 
@@ -177,7 +181,18 @@ class ConfluenceIndexer(Indexer):
 
 
 class ConfluenceDownloaderConfig(DownloaderConfig):
-    pass
+    extract_images: bool = Field(
+        default=False,
+        description="if true, will download images and replace "
+        "the html content with base64 encoded images",
+    )
+    extract_files: bool = Field(
+        default=False, description="if true, will download any embedded files"
+    )
+    force_download: bool = Field(
+        default=False,
+        description="if true, will redownload extracted files even if they already exist locally",
+    )
 
 
 @dataclass
@@ -186,14 +201,37 @@ class ConfluenceDownloader(Downloader):
     download_config: ConfluenceDownloaderConfig = field(default_factory=ConfluenceDownloaderConfig)
     connector_type: str = CONNECTOR_TYPE
 
-    def run(self, file_data: FileData, **kwargs) -> DownloadResponse:
+    def download_embedded_files(
+        self, session, html: str, current_file_data: FileData
+    ) -> list[DownloadResponse]:
+        if not self.download_config.extract_files:
+            return []
+        from unstructured_ingest.utils.html import download_embedded_files
+
+        filepath = current_file_data.source_identifiers.relative_path
+        download_path = Path(self.download_dir) / filepath
+        download_dir = download_path.with_suffix("")
+        return download_embedded_files(
+            download_dir=download_dir,
+            original_filedata=current_file_data,
+            original_html=html,
+            session=session,
+            force_download=self.download_config.force_download,
+        )
+
+    def run(self, file_data: FileData, **kwargs) -> download_responses:
+        from bs4 import BeautifulSoup
+
+        from unstructured_ingest.utils.html import convert_image_tags
+
         doc_id = file_data.identifier
         try:
-            client = self.connection_config.get_client()
-            page = client.get_page_by_id(
-                page_id=doc_id,
-                expand="history.lastUpdated,version,body.view",
-            )
+            with self.connection_config.get_client() as client:
+                page = client.get_page_by_id(
+                    page_id=doc_id,
+                    expand="history.lastUpdated,version,body.view",
+                )
+
         except Exception as e:
             logger.error(f"Failed to retrieve page with ID {doc_id}: {e}", exc_info=True)
             raise SourceConnectionError(f"Failed to retrieve page with ID {doc_id}: {e}")
@@ -202,20 +240,52 @@ class ConfluenceDownloader(Downloader):
             raise ValueError(f"Page with ID {doc_id} does not exist.")
 
         content = page["body"]["view"]["value"]
+        # This supports v2 html parsing in unstructured
+        title = page["title"]
+        title_html = f"<title>{title}</title>"
+        content = f"<body class='Document' >{title_html}{content}</body>"
+        if self.download_config.extract_images:
+            with self.connection_config.get_client() as client:
+                content = convert_image_tags(
+                    url=file_data.metadata.url, original_html=content, session=client._session
+                )
 
         filepath = file_data.source_identifiers.relative_path
         download_path = Path(self.download_dir) / filepath
         download_path.parent.mkdir(parents=True, exist_ok=True)
         with open(download_path, "w", encoding="utf8") as f:
-            f.write(content)
+            soup = BeautifulSoup(content, "html.parser")
+            f.write(soup.prettify())
 
         # Update file_data with metadata
         file_data.metadata.date_created = page["history"]["createdDate"]
         file_data.metadata.date_modified = page["version"]["when"]
         file_data.metadata.version = str(page["version"]["number"])
-        file_data.display_name = page["title"]
+        file_data.display_name = title
 
-        return self.generate_download_response(file_data=file_data, download_path=download_path)
+        download_response = self.generate_download_response(
+            file_data=file_data, download_path=download_path
+        )
+        if self.download_config.extract_files:
+            with self.connection_config.get_client() as client:
+                extracted_download_responses = self.download_embedded_files(
+                    html=content,
+                    current_file_data=download_response["file_data"],
+                    session=client._session,
+                )
+            if extracted_download_responses:
+                for dr in extracted_download_responses:
+                    fd = dr["file_data"]
+                    source_file_path = Path(file_data.source_identifiers.fullpath).with_suffix(
+                        ""
+                    )
+                    new_fullpath = source_file_path / fd.source_identifiers.filename
+                    fd.source_identifiers = SourceIdentifiers(
+                        fullpath=new_fullpath.as_posix(), filename=new_fullpath.name
+                    )
+                extracted_download_responses.append(download_response)
+                return extracted_download_responses
+        return download_response
 
 
 confluence_source_entry = SourceRegistryEntry(
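
The Confluence downloader thus gains three opt-in flags. A hedged sketch of enabling them; construction of the connection and indexer configs is unchanged and omitted here:

from unstructured_ingest.v2.processes.connectors.confluence import ConfluenceDownloaderConfig

# All three flags default to False, preserving the 0.4.0 behavior.
download_config = ConfluenceDownloaderConfig(
    extract_images=True,   # rewrite <img> tags to base64 data URIs via convert_image_tags
    extract_files=True,    # also download files linked from the page body as separate documents
    force_download=False,  # reuse already-extracted files instead of re-fetching them
)
# A ConfluenceDownloader built with this config returns the page plus one DownloadResponse
# per extracted file when extract_files is enabled.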

unstructured_ingest/v2/processes/connectors/duckdb/base.py
@@ -4,7 +4,7 @@ from typing import Any
 
 import pandas as pd
 
-from unstructured_ingest.utils.data_prep import get_data
+from unstructured_ingest.utils.data_prep import get_data, write_data
 from unstructured_ingest.v2.interfaces import FileData, UploadStager
 from unstructured_ingest.v2.utils import get_enhanced_element_id
 
@@ -96,5 +96,5 @@ class BaseDuckDBUploadStager(UploadStager):
             df[column] = df[column].apply(str)
 
         data = df.to_dict(orient="records")
-        self.write_output(output_path=output_path, data=data)
+        write_data(path=output_path, data=data)
         return output_path

unstructured_ingest/v2/processes/connectors/fsspec/azure.py
@@ -128,22 +128,22 @@ class AzureIndexer(FsspecIndexer):
     def sterilize_info(self, file_data: dict) -> dict:
         return sterilize_dict(data=file_data, default=azure_json_serial)
 
-    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
-        path = file_data["name"]
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"]
         date_created = (
-            str(file_data.get("creation_time").timestamp())
-            if "creation_time" in file_data
+            str(file_info.get("creation_time").timestamp())
+            if "creation_time" in file_info
             else None
         )
         date_modified = (
-            str(file_data.get("last_modified").timestamp())
-            if "last_modified" in file_data
+            str(file_info.get("last_modified").timestamp())
+            if "last_modified" in file_info
             else None
         )
 
-        file_size = file_data.get("size") if "size" in file_data else None
+        file_size = file_info.get("size") if "size" in file_info else None
 
-        version = file_data.get("etag")
+        version = file_info.get("etag")
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,

unstructured_ingest/v2/processes/connectors/fsspec/box.py
@@ -104,22 +104,22 @@ class BoxIndexer(FsspecIndexer):
     index_config: BoxIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
-        path = file_data["name"]
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"]
         date_created = None
         date_modified = None
-        if modified_at_str := file_data.get("modified_at"):
+        if modified_at_str := file_info.get("modified_at"):
             date_modified = str(parser.parse(modified_at_str).timestamp())
-        if created_at_str := file_data.get("created_at"):
+        if created_at_str := file_info.get("created_at"):
             date_created = str(parser.parse(created_at_str).timestamp())
 
-        file_size = file_data.get("size") if "size" in file_data else None
+        file_size = file_info.get("size") if "size" in file_info else None
 
-        version = file_data.get("id")
+        version = file_info.get("id")
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
-            "file_id": file_data.get("id"),
+            "file_id": file_info.get("id"),
         }
         return FileDataSourceMetadata(
             date_created=date_created,

unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py
@@ -93,15 +93,15 @@ class DropboxIndexer(FsspecIndexer):
     index_config: DropboxIndexerConfig
     connector_type: str = CONNECTOR_TYPE
 
-    def get_path(self, file_data: dict) -> str:
-        return file_data["name"]
+    def get_path(self, file_info: dict) -> str:
+        return file_info["name"]
 
-    def get_metadata(self, file_data: dict) -> FileDataSourceMetadata:
-        path = file_data["name"].lstrip("/")
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"].lstrip("/")
         date_created = None
         date_modified = None
-        server_modified = file_data.get("server_modified")
-        client_modified = file_data.get("client_modified")
+        server_modified = file_info.get("server_modified")
+        client_modified = file_info.get("client_modified")
         if server_modified and client_modified and server_modified > client_modified:
             date_created = str(client_modified.timestamp())
             date_modified = str(server_modified.timestamp())
@@ -109,13 +109,13 @@ class DropboxIndexer(FsspecIndexer):
             date_created = str(server_modified.timestamp())
             date_modified = str(client_modified.timestamp())
 
-        file_size = file_data.get("size") if "size" in file_data else None
+        file_size = file_info.get("size") if "size" in file_info else None
 
-        version = file_data.get("content_hash")
+        version = file_info.get("content_hash")
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
-            "file_id": file_data.get("id"),
+            "file_id": file_info.get("id"),
         }
         return FileDataSourceMetadata(
             date_created=date_created,