unstructured-ingest 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/connectors/sql/test_databricks_delta_tables.py +10 -10
- test/integration/connectors/utils/validation/equality.py +2 -1
- test/unit/v2/connectors/databricks/__init__.py +0 -0
- test/unit/v2/connectors/databricks/test_volumes_table.py +44 -0
- test/unit/v2/connectors/sql/test_sql.py +4 -2
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/data_prep.py +11 -3
- unstructured_ingest/utils/html.py +109 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -13
- unstructured_ingest/v2/pipeline/steps/chunk.py +3 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +3 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -4
- unstructured_ingest/v2/processes/connectors/confluence.py +95 -25
- unstructured_ingest/v2/processes/connectors/databricks/volumes_table.py +14 -11
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +8 -8
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +7 -7
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +9 -9
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +41 -9
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +7 -7
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +8 -8
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +4 -0
- unstructured_ingest/v2/processes/connectors/sql/databricks_delta_tables.py +15 -15
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +2 -1
- unstructured_ingest/v2/processes/connectors/sql/sql.py +14 -7
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +2 -1
- unstructured_ingest/v2/processes/connectors/sql/vastdb.py +270 -0
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/METADATA +23 -20
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/RECORD +35 -30
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.2.dist-info}/top_level.txt +0 -0

test/integration/connectors/sql/test_databricks_delta_tables.py

@@ -17,11 +17,11 @@ from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers
from unstructured_ingest.v2.logger import logger
from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
    CONNECTOR_TYPE,
-
-
-
-
-
+    DatabricksDeltaTablesAccessConfig,
+    DatabricksDeltaTablesConnectionConfig,
+    DatabricksDeltaTablesUploader,
+    DatabricksDeltaTablesUploaderConfig,
+    DatabricksDeltaTablesUploadStager,
)

CATALOG = "utic-dev-tech-fixtures"

@@ -112,7 +112,7 @@ async def test_databricks_delta_tables_destination(
        connector_type=CONNECTOR_TYPE,
        source_identifiers=SourceIdentifiers(filename=upload_file.name, fullpath=upload_file.name),
    )
-    stager =
+    stager = DatabricksDeltaTablesUploadStager()
    staged_path = stager.run(
        elements_filepath=upload_file,
        file_data=mock_file_data,

@@ -122,15 +122,15 @@ async def test_databricks_delta_tables_destination(

    assert staged_path.suffix == upload_file.suffix

-    uploader =
-        connection_config=
-            access_config=
+    uploader = DatabricksDeltaTablesUploader(
+        connection_config=DatabricksDeltaTablesConnectionConfig(
+            access_config=DatabricksDeltaTablesAccessConfig(
                token=env_data.access_token.get_secret_value()
            ),
            http_path=env_data.http_path,
            server_hostname=env_data.server_hostname,
        ),
-        upload_config=
+        upload_config=DatabricksDeltaTablesUploaderConfig(
            catalog=CATALOG, database="default", table_name=destination_table
        ),
    )

test/integration/connectors/utils/validation/equality.py

@@ -1,10 +1,11 @@
import json
from pathlib import Path

-import ndjson
from bs4 import BeautifulSoup
from deepdiff import DeepDiff

+from unstructured_ingest.utils import ndjson
+

def json_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
    with expected_filepath.open() as f:

test/unit/v2/connectors/databricks/__init__.py

File without changes

test/unit/v2/connectors/databricks/test_volumes_table.py

@@ -0,0 +1,44 @@
+from pathlib import Path
+
+import pytest
+from pytest_mock import MockerFixture
+
+from unstructured_ingest.v2.processes.connectors.databricks.volumes_table import (
+    DatabricksVolumeDeltaTableStager,
+)
+
+
+@pytest.fixture
+def stager():
+    return DatabricksVolumeDeltaTableStager()
+
+
+@pytest.mark.parametrize(
+    ("output_path", "called_output_path"),
+    [
+        (
+            Path("/fake/path/output"),
+            Path("/fake/path/output.json"),
+        ),
+        (
+            Path("/fake/path/output.ndjson"),
+            Path("/fake/path/output.json"),
+        ),
+    ],
+)
+def test_write_output(
+    mocker: MockerFixture,
+    stager: DatabricksVolumeDeltaTableStager,
+    output_path: Path,
+    called_output_path: Path,
+):
+    data = [{"key1": "value1", "key2": "value2"}]
+
+    mock_get_data = mocker.patch(
+        "unstructured_ingest.v2.processes.connectors.databricks.volumes_table.write_data",
+        return_value=None,
+    )
+
+    stager.write_output(output_path, data)
+
+    mock_get_data.assert_called_once_with(path=called_output_path, data=data, indent=None)

test/unit/v2/connectors/sql/test_sql.py

@@ -47,7 +47,9 @@ def test_run_output_filename_suffix(
    mock_get_output_path = mocker.patch.object(
        SQLUploadStager, "get_output_path", return_value=output_dir / expected
    )
-    mock_write_output = mocker.patch
+    mock_write_output = mocker.patch(
+        "unstructured_ingest.v2.processes.connectors.sql.sql.write_data", return_value=None
+    )

    # Act
    result = mock_instance.run(

@@ -67,6 +69,6 @@ def test_run_output_filename_suffix(
    mock_conform_dataframe.assert_called_once()
    mock_get_output_path.assert_called_once_with(output_filename=expected, output_dir=output_dir)
    mock_write_output.assert_called_once_with(
-
+        path=output_dir / expected, data=[{"key": "value"}, {"key": "value2"}]
    )
    assert result.name == expected

unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.4.
+__version__ = "0.4.2"  # pragma: no cover

unstructured_ingest/utils/data_prep.py

@@ -4,9 +4,9 @@ from datetime import datetime
from pathlib import Path
from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast

-import ndjson
import pandas as pd

+from unstructured_ingest.utils import ndjson
from unstructured_ingest.v2.logger import logger

DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")

@@ -153,6 +153,16 @@ def get_data_by_suffix(path: Path) -> list[dict]:
    raise ValueError(f"Unsupported file type: {path}")


+def write_data(path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
+    with path.open("w") as f:
+        if path.suffix == ".json":
+            json.dump(data, f, indent=indent, ensure_ascii=False)
+        elif path.suffix == ".ndjson":
+            ndjson.dump(data, f, ensure_ascii=False)
+        else:
+            raise IOError("Unsupported file type: {path}")
+
+
def get_data(path: Path) -> list[dict]:
    try:
        return get_data_by_suffix(path=path)

@@ -179,8 +189,6 @@ def get_data(path: Path) -> list[dict]:
    except Exception as e:
        logger.warning(f"failed to read {path} as parquet: {e}")

-    raise IOError(f"File could not be parsed: {path}")
-

def get_data_df(path: Path) -> pd.DataFrame:
    with path.open() as f:
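
For context, a minimal usage sketch of the new write_data helper (the paths and element dicts below are illustrative, not taken from the diff): it picks the serialization format from the output suffix and delegates .ndjson output to the bundled ndjson module.

from pathlib import Path

from unstructured_ingest.utils.data_prep import write_data

elements = [{"type": "Title", "text": "Hello"}, {"type": "NarrativeText", "text": "World"}]

# ".json" writes a single indented JSON array (indent defaults to 2)
write_data(path=Path("/tmp/elements.json"), data=elements)

# ".ndjson" writes one JSON object per line via the bundled ndjson helper
write_data(path=Path("/tmp/elements.ndjson"), data=elements)

# any other suffix raises IOError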

unstructured_ingest/utils/html.py

@@ -0,0 +1,109 @@
+import base64
+from pathlib import Path
+from typing import Optional
+from urllib.parse import urlparse
+from uuid import NAMESPACE_DNS, uuid5
+
+import requests
+from bs4 import BeautifulSoup
+from requests import Session
+
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, SourceIdentifiers
+from unstructured_ingest.v2.logger import logger
+
+
+def convert_image_tags(url: str, original_html: str, session: Optional[Session] = None) -> str:
+    session = session or requests.Session()
+    parsed_url = urlparse(url)
+    base_url = parsed_url.scheme + "://" + parsed_url.netloc
+    soup = BeautifulSoup(original_html, "html.parser")
+    images = soup.find_all("img")
+    for image in images:
+        current_source = image["src"]
+        if current_source.startswith("//"):
+            source_url = f"{parsed_url.scheme}:{current_source}"
+        elif current_source.startswith("http"):
+            source_url = current_source
+        else:
+            source_url = base_url + current_source
+        try:
+            response = session.get(source_url)
+            response.raise_for_status()
+            image_content = response.content
+            logger.debug(
+                "img tag having src updated from {} to base64 content".format(image["src"])
+            )
+            image["src"] = f"data:image/png;base64,{base64.b64encode(image_content).decode()}"
+        except Exception as e:
+            logger.warning(
+                f"failed to download image content from {source_url}: {e}", exc_info=True
+            )
+    return str(soup)
+
+
+def download_link(
+    download_dir: Path, link: str, session: Optional[Session] = None, force_download: bool = False
+) -> Path:
+    session = session or requests.Session()
+    filename = Path(urlparse(url=link).path).name
+    download_path = download_dir / filename
+    logger.debug(f"downloading file from {link} to {download_path}")
+    if download_path.exists() and download_path.is_file() and not force_download:
+        return download_path
+    with download_path.open("wb") as downloaded_file:
+        response = session.get(link)
+        response.raise_for_status()
+        downloaded_file.write(response.content)
+    return download_path
+
+
+def download_embedded_files(
+    download_dir: Path,
+    original_filedata: FileData,
+    original_html: str,
+    session: Optional[Session] = None,
+    force_download: bool = False,
+) -> list[DownloadResponse]:
+    session = session or requests.Session()
+    url = original_filedata.metadata.url
+    parsed_url = urlparse(url)
+    base_url = parsed_url.scheme + "://" + parsed_url.netloc
+    soup = BeautifulSoup(original_html, "html.parser")
+    tags = soup.find_all("a", href=True)
+    hrefs = [
+        tag["href"]
+        for tag in tags
+        if not tag["href"].startswith("#") and Path(tag["href"]).suffix != ""
+    ]
+    results = []
+    for current_source in hrefs:
+        download_dir.mkdir(parents=True, exist_ok=True)
+        if current_source.startswith("//"):
+            source_url = f"{parsed_url.scheme}:{current_source}"
+        elif current_source.startswith("http"):
+            source_url = current_source
+        else:
+            source_url = base_url + current_source
+        try:
+            downloaded_path = download_link(
+                download_dir=download_dir,
+                link=source_url,
+                session=session,
+                force_download=force_download,
+            )
+        except Exception as e:
+            logger.warning(f"failed to download file content from {source_url}: {e}")
+            continue
+        result_file_data = original_filedata.model_copy(deep=True)
+        result_file_data.metadata.url = source_url
+        result_file_data.metadata.record_locator["parent_url"] = url
+        result_file_data.identifier = str(
+            uuid5(NAMESPACE_DNS, source_url + original_filedata.identifier)
+        )
+        filename = Path(urlparse(url=source_url).path).name
+        result_file_data.source_identifiers = SourceIdentifiers(
+            filename=filename, fullpath=filename
+        )
+        result_file_data.local_download_path = downloaded_path.as_posix()
+        results.append(DownloadResponse(file_data=result_file_data, path=downloaded_path))
+    return results
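
A hedged usage sketch for convert_image_tags (the URL and HTML below are made up): each <img> src is resolved against the page URL, fetched over HTTP, and inlined as a base64 data URI; failed downloads are logged and the tag is left untouched.

from unstructured_ingest.utils.html import convert_image_tags

page_html = '<p>Logo:</p><img src="/static/logo.png">'
inlined = convert_image_tags(url="https://wiki.example.com/spaces/DOC/page", original_html=page_html)
# on success the img tag now reads: <img src="data:image/png;base64,...">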

unstructured_ingest/utils/ndjson.py

@@ -0,0 +1,52 @@
+import json
+from typing import IO, Any
+
+
+def dumps(obj: list[dict[str, Any]], **kwargs) -> str:
+    return "\n".join(json.dumps(each, **kwargs) for each in obj)
+
+
+def dump(obj: list[dict[str, Any]], fp: IO, **kwargs) -> None:
+    # Indent breaks ndjson formatting
+    kwargs["indent"] = None
+    text = dumps(obj, **kwargs)
+    fp.write(text)
+
+
+def loads(s: str, **kwargs) -> list[dict[str, Any]]:
+    return [json.loads(line, **kwargs) for line in s.splitlines()]
+
+
+def load(fp: IO, **kwargs) -> list[dict[str, Any]]:
+    return loads(fp.read(), **kwargs)
+
+
+class writer(object):
+    def __init__(self, f, **kwargs):
+        self.f = f
+        self.kwargs = kwargs
+
+    def write(self, row):
+        stringified = json.dumps(row, **self.kwargs)
+        self.f.write(stringified + "\n")
+
+
+class reader(object):
+    def __init__(self, f, **kwargs):
+        self.f = f
+        self.kwargs = kwargs
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        line = ""
+
+        while line == "":
+            line = next(self.f).strip()
+
+        return json.loads(line, **self.kwargs)
+
+    # NOTE: this is necessary to comply with py27
+    def next(self):
+        return self.__next__()
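
For reference, a small round-trip sketch with this bundled ndjson helper (the rows are illustrative); it shows why dump forces indent=None and how reader skips blank lines.

import io

from unstructured_ingest.utils import ndjson

rows = [{"id": 1, "text": "first"}, {"id": 2, "text": "second"}]

buf = io.StringIO()
ndjson.dump(rows, buf)  # one JSON object per line; indent is forced to None
assert ndjson.loads(buf.getvalue()) == rows

buf.seek(0)
assert list(ndjson.reader(buf)) == rows  # reader skips blank lines while iterating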

unstructured_ingest/v2/interfaces/upload_stager.py

@@ -2,11 +2,11 @@ import json
from abc import ABC
from dataclasses import dataclass
from pathlib import Path
-from typing import Any,
+from typing import Any, TypeVar

-import ndjson
from pydantic import BaseModel

+from unstructured_ingest.utils import ndjson
from unstructured_ingest.v2.interfaces.file_data import FileData
from unstructured_ingest.v2.interfaces.process import BaseProcess

@@ -22,16 +22,6 @@ UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
class UploadStager(BaseProcess, ABC):
    upload_stager_config: UploadStagerConfigT

-    def write_output(self, output_path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
-        if output_path.suffix == ".json":
-            with output_path.open("w") as f:
-                json.dump(data, f, indent=indent)
-        elif output_path.suffix == ".ndjson":
-            with output_path.open("w") as f:
-                ndjson.dump(data, f)
-        else:
-            raise ValueError(f"Unsupported output format: {output_path}")
-
    def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
        return element_dict

@@ -49,7 +39,7 @@ class UploadStager(BaseProcess, ABC):
            writer = ndjson.writer(out_f)
            for element in reader:
                conformed_element = self.conform_dict(element_dict=element, file_data=file_data)
-                writer.
+                writer.write(row=conformed_element)
            writer.f.flush()

    def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:

unstructured_ingest/v2/pipeline/steps/chunk.py

@@ -1,10 +1,10 @@
import asyncio
import hashlib
-import json
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Optional, TypedDict

+from unstructured_ingest.utils.data_prep import write_data
from unstructured_ingest.v2.interfaces import FileData
from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
from unstructured_ingest.v2.logger import logger

@@ -44,9 +44,8 @@ class ChunkStep(PipelineStep):
        return filepath

    def _save_output(self, output_filepath: str, chunked_content: list[dict]):
-
-
-            json.dump(chunked_content, f, indent=2)
+        logger.debug(f"writing chunker output to: {output_filepath}")
+        write_data(path=Path(output_filepath), data=chunked_content)

    async def _run_async(
        self, fn: Callable, path: str, file_data_path: str, **kwargs

unstructured_ingest/v2/pipeline/steps/embed.py

@@ -1,10 +1,10 @@
import asyncio
import hashlib
-import json
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Optional, TypedDict

+from unstructured_ingest.utils.data_prep import write_data
from unstructured_ingest.v2.interfaces import FileData
from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
from unstructured_ingest.v2.logger import logger

@@ -44,9 +44,8 @@ class EmbedStep(PipelineStep):
        return filepath

    def _save_output(self, output_filepath: str, embedded_content: list[dict]):
-
-
-            json.dump(embedded_content, f, indent=2)
+        logger.debug(f"writing embedded output to: {output_filepath}")
+        write_data(path=Path(output_filepath), data=embedded_content)

    async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
        path = Path(path)

unstructured_ingest/v2/pipeline/steps/partition.py

@@ -1,10 +1,10 @@
import asyncio
import hashlib
-import json
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Optional, TypedDict

+from unstructured_ingest.utils.data_prep import write_data
from unstructured_ingest.v2.interfaces import FileData
from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
from unstructured_ingest.v2.logger import logger

@@ -44,9 +44,8 @@ class PartitionStep(PipelineStep):
        return filepath

    def _save_output(self, output_filepath: str, partitioned_content: list[dict]):
-
-
-            json.dump(partitioned_content, f, indent=2)
+        logger.debug(f"writing partitioned output to: {output_filepath}")
+        write_data(path=Path(output_filepath), data=partitioned_content)

    async def _run_async(
        self, fn: Callable, path: str, file_data_path: str

unstructured_ingest/v2/processes/connectors/confluence.py

@@ -1,3 +1,4 @@
+from contextlib import contextmanager
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING, Generator, List, Optional

@@ -17,6 +18,7 @@ from unstructured_ingest.v2.interfaces import (
    Indexer,
    IndexerConfig,
    SourceIdentifiers,
+    download_responses,
)
from unstructured_ingest.v2.logger import logger
from unstructured_ingest.v2.processes.connector_registry import (

@@ -71,17 +73,19 @@ class ConfluenceConnectionConfig(ConnectionConfig):
    )

    @requires_dependencies(["atlassian"], extras="confluence")
+    @contextmanager
    def get_client(self) -> "Confluence":
        from atlassian import Confluence

        access_configs = self.access_config.get_secret_value()
-
+        with Confluence(
            url=self.url,
            username=self.username,
            password=access_configs.password,
            token=access_configs.token,
            cloud=self.cloud,
-        )
+        ) as client:
+            yield client


class ConfluenceIndexerConfig(IndexerConfig):

@@ -103,8 +107,8 @@ class ConfluenceIndexer(Indexer):

            # Attempt to retrieve a list of spaces with limit=1.
            # This should only succeed if all creds are valid
-
-
+            with self.connection_config.get_client() as client:
+                client.get_all_spaces(limit=1)
            logger.info("Connection to Confluence successful.")
            return True
        except Exception as e:

@@ -116,21 +120,21 @@ class ConfluenceIndexer(Indexer):
        if spaces:
            return spaces
        else:
-
-
+            with self.connection_config.get_client() as client:
+                all_spaces = client.get_all_spaces(limit=self.index_config.max_num_of_spaces)
            space_ids = [space["key"] for space in all_spaces["results"]]
            return space_ids

    def _get_docs_ids_within_one_space(self, space_id: str) -> List[dict]:
-
-
-
-
-
-
-
-
-
+        with self.connection_config.get_client() as client:
+            pages = client.get_all_pages_from_space(
+                space=space_id,
+                start=0,
+                limit=self.index_config.max_num_of_docs_from_each_space,
+                expand=None,
+                content_type="page",
+                status=None,
+            )
        doc_ids = [{"space_id": space_id, "doc_id": page["id"]} for page in pages]
        return doc_ids

@@ -177,7 +181,18 @@ class ConfluenceIndexer(Indexer):


class ConfluenceDownloaderConfig(DownloaderConfig):
-
+    extract_images: bool = Field(
+        default=False,
+        description="if true, will download images and replace "
+        "the html content with base64 encoded images",
+    )
+    extract_files: bool = Field(
+        default=False, description="if true, will download any embedded files"
+    )
+    force_download: bool = Field(
+        default=False,
+        description="if true, will redownload extracted files even if they already exist locally",
+    )


@dataclass

@@ -186,14 +201,37 @@ class ConfluenceDownloader(Downloader):
    download_config: ConfluenceDownloaderConfig = field(default_factory=ConfluenceDownloaderConfig)
    connector_type: str = CONNECTOR_TYPE

-    def
+    def download_embedded_files(
+        self, session, html: str, current_file_data: FileData
+    ) -> list[DownloadResponse]:
+        if not self.download_config.extract_files:
+            return []
+        from unstructured_ingest.utils.html import download_embedded_files
+
+        filepath = current_file_data.source_identifiers.relative_path
+        download_path = Path(self.download_dir) / filepath
+        download_dir = download_path.with_suffix("")
+        return download_embedded_files(
+            download_dir=download_dir,
+            original_filedata=current_file_data,
+            original_html=html,
+            session=session,
+            force_download=self.download_config.force_download,
+        )
+
+    def run(self, file_data: FileData, **kwargs) -> download_responses:
+        from bs4 import BeautifulSoup
+
+        from unstructured_ingest.utils.html import convert_image_tags
+
        doc_id = file_data.identifier
        try:
-
-
-
-
-
+            with self.connection_config.get_client() as client:
+                page = client.get_page_by_id(
+                    page_id=doc_id,
+                    expand="history.lastUpdated,version,body.view",
+                )
+
        except Exception as e:
            logger.error(f"Failed to retrieve page with ID {doc_id}: {e}", exc_info=True)
            raise SourceConnectionError(f"Failed to retrieve page with ID {doc_id}: {e}")

@@ -202,20 +240,52 @@ class ConfluenceDownloader(Downloader):
            raise ValueError(f"Page with ID {doc_id} does not exist.")

        content = page["body"]["view"]["value"]
+        # This supports v2 html parsing in unstructured
+        title = page["title"]
+        title_html = f"<title>{title}</title>"
+        content = f"<body class='Document' >{title_html}{content}</body>"
+        if self.download_config.extract_images:
+            with self.connection_config.get_client() as client:
+                content = convert_image_tags(
+                    url=file_data.metadata.url, original_html=content, session=client._session
+                )

        filepath = file_data.source_identifiers.relative_path
        download_path = Path(self.download_dir) / filepath
        download_path.parent.mkdir(parents=True, exist_ok=True)
        with open(download_path, "w", encoding="utf8") as f:
-
+            soup = BeautifulSoup(content, "html.parser")
+            f.write(soup.prettify())

        # Update file_data with metadata
        file_data.metadata.date_created = page["history"]["createdDate"]
        file_data.metadata.date_modified = page["version"]["when"]
        file_data.metadata.version = str(page["version"]["number"])
-        file_data.display_name =
+        file_data.display_name = title

-
+        download_response = self.generate_download_response(
+            file_data=file_data, download_path=download_path
+        )
+        if self.download_config.extract_files:
+            with self.connection_config.get_client() as client:
+                extracted_download_responses = self.download_embedded_files(
+                    html=content,
+                    current_file_data=download_response["file_data"],
+                    session=client._session,
+                )
+            if extracted_download_responses:
+                for dr in extracted_download_responses:
+                    fd = dr["file_data"]
+                    source_file_path = Path(file_data.source_identifiers.fullpath).with_suffix(
+                        ""
+                    )
+                    new_fullpath = source_file_path / fd.source_identifiers.filename
+                    fd.source_identifiers = SourceIdentifiers(
+                        fullpath=new_fullpath.as_posix(), filename=new_fullpath.name
+                    )
+                extracted_download_responses.append(download_response)
+            return extracted_download_responses
+        return download_response


confluence_source_entry = SourceRegistryEntry(
|