unstructured-ingest 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/connectors/utils/validation/equality.py +2 -1
- test/unit/v2/connectors/sql/test_sql.py +4 -2
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/data_prep.py +11 -3
- unstructured_ingest/utils/html.py +109 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -13
- unstructured_ingest/v2/pipeline/steps/chunk.py +3 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +3 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -4
- unstructured_ingest/v2/processes/connectors/confluence.py +95 -25
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +8 -8
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +7 -7
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +9 -9
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +41 -9
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +7 -7
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +8 -8
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +4 -0
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +2 -1
- unstructured_ingest/v2/processes/connectors/sql/sql.py +12 -8
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +2 -1
- unstructured_ingest/v2/processes/connectors/sql/vastdb.py +270 -0
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.1.dist-info}/METADATA +25 -22
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.1.dist-info}/RECORD +30 -27
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.1.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.1.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.4.0.dist-info → unstructured_ingest-0.4.1.dist-info}/top_level.txt +0 -0

test/integration/connectors/utils/validation/equality.py

@@ -1,10 +1,11 @@
 import json
 from pathlib import Path

-import ndjson
 from bs4 import BeautifulSoup
 from deepdiff import DeepDiff

+from unstructured_ingest.utils import ndjson
+

 def json_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
     with expected_filepath.open() as f:

test/unit/v2/connectors/sql/test_sql.py

@@ -47,7 +47,9 @@ def test_run_output_filename_suffix(
     mock_get_output_path = mocker.patch.object(
         SQLUploadStager, "get_output_path", return_value=output_dir / expected
     )
-    mock_write_output = mocker.patch
+    mock_write_output = mocker.patch(
+        "unstructured_ingest.v2.processes.connectors.sql.sql.write_data", return_value=None
+    )

     # Act
     result = mock_instance.run(
@@ -67,6 +69,6 @@ def test_run_output_filename_suffix(
     mock_conform_dataframe.assert_called_once()
     mock_get_output_path.assert_called_once_with(output_filename=expected, output_dir=output_dir)
     mock_write_output.assert_called_once_with(
-
+        path=output_dir / expected, data=[{"key": "value"}, {"key": "value2"}]
     )
     assert result.name == expected

unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.4.
+__version__ = "0.4.1"  # pragma: no cover

unstructured_ingest/utils/data_prep.py

@@ -4,9 +4,9 @@ from datetime import datetime
 from pathlib import Path
 from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast

-import ndjson
 import pandas as pd

+from unstructured_ingest.utils import ndjson
 from unstructured_ingest.v2.logger import logger

 DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
@@ -153,6 +153,16 @@ def get_data_by_suffix(path: Path) -> list[dict]:
    raise ValueError(f"Unsupported file type: {path}")


+def write_data(path: Path, data: list[dict], indent: int = 2) -> None:
+    with path.open("w") as f:
+        if path.suffix == ".json":
+            json.dump(data, f, indent=indent, ensure_ascii=False)
+        elif path.suffix == ".ndjson":
+            ndjson.dump(data, f, ensure_ascii=False)
+        else:
+            raise IOError("Unsupported file type: {path}")
+
+
 def get_data(path: Path) -> list[dict]:
     try:
         return get_data_by_suffix(path=path)
@@ -179,8 +189,6 @@ def get_data(path: Path) -> list[dict]:
     except Exception as e:
         logger.warning(f"failed to read {path} as parquet: {e}")

-    raise IOError(f"File could not be parsed: {path}")
-

 def get_data_df(path: Path) -> pd.DataFrame:
     with path.open() as f:
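
The `write_data` helper added above becomes the single place where staged element lists get persisted, choosing the format from the file suffix; `get_data` (already in this module) is its read-side counterpart. A minimal round-trip sketch, where the temporary paths and sample element dicts are only illustrative:

from pathlib import Path

from unstructured_ingest.utils.data_prep import get_data, write_data

# Illustrative element dicts; real pipelines pass partitioned/chunked/embedded elements.
elements = [{"type": "Title", "text": "hello"}, {"type": "NarrativeText", "text": "world"}]

# The suffix picks the serialization: .json -> indented JSON array, .ndjson -> one object per line.
write_data(path=Path("/tmp/elements.json"), data=elements)
write_data(path=Path("/tmp/elements.ndjson"), data=elements)

# get_data reads either format back into a list[dict].
assert get_data(path=Path("/tmp/elements.ndjson")) == elements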

unstructured_ingest/utils/html.py

@@ -0,0 +1,109 @@
+import base64
+from pathlib import Path
+from typing import Optional
+from urllib.parse import urlparse
+from uuid import NAMESPACE_DNS, uuid5
+
+import requests
+from bs4 import BeautifulSoup
+from requests import Session
+
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, SourceIdentifiers
+from unstructured_ingest.v2.logger import logger
+
+
+def convert_image_tags(url: str, original_html: str, session: Optional[Session] = None) -> str:
+    session = session or requests.Session()
+    parsed_url = urlparse(url)
+    base_url = parsed_url.scheme + "://" + parsed_url.netloc
+    soup = BeautifulSoup(original_html, "html.parser")
+    images = soup.find_all("img")
+    for image in images:
+        current_source = image["src"]
+        if current_source.startswith("//"):
+            source_url = f"{parsed_url.scheme}:{current_source}"
+        elif current_source.startswith("http"):
+            source_url = current_source
+        else:
+            source_url = base_url + current_source
+        try:
+            response = session.get(source_url)
+            response.raise_for_status()
+            image_content = response.content
+            logger.debug(
+                "img tag having src updated from {} to base64 content".format(image["src"])
+            )
+            image["src"] = f"data:image/png;base64,{base64.b64encode(image_content).decode()}"
+        except Exception as e:
+            logger.warning(
+                f"failed to download image content from {source_url}: {e}", exc_info=True
+            )
+    return str(soup)
+
+
+def download_link(
+    download_dir: Path, link: str, session: Optional[Session] = None, force_download: bool = False
+) -> Path:
+    session = session or requests.Session()
+    filename = Path(urlparse(url=link).path).name
+    download_path = download_dir / filename
+    logger.debug(f"downloading file from {link} to {download_path}")
+    if download_path.exists() and download_path.is_file() and not force_download:
+        return download_path
+    with download_path.open("wb") as downloaded_file:
+        response = session.get(link)
+        response.raise_for_status()
+        downloaded_file.write(response.content)
+    return download_path
+
+
+def download_embedded_files(
+    download_dir: Path,
+    original_filedata: FileData,
+    original_html: str,
+    session: Optional[Session] = None,
+    force_download: bool = False,
+) -> list[DownloadResponse]:
+    session = session or requests.Session()
+    url = original_filedata.metadata.url
+    parsed_url = urlparse(url)
+    base_url = parsed_url.scheme + "://" + parsed_url.netloc
+    soup = BeautifulSoup(original_html, "html.parser")
+    tags = soup.find_all("a", href=True)
+    hrefs = [
+        tag["href"]
+        for tag in tags
+        if not tag["href"].startswith("#") and Path(tag["href"]).suffix != ""
+    ]
+    results = []
+    for current_source in hrefs:
+        download_dir.mkdir(parents=True, exist_ok=True)
+        if current_source.startswith("//"):
+            source_url = f"{parsed_url.scheme}:{current_source}"
+        elif current_source.startswith("http"):
+            source_url = current_source
+        else:
+            source_url = base_url + current_source
+        try:
+            downloaded_path = download_link(
+                download_dir=download_dir,
+                link=source_url,
+                session=session,
+                force_download=force_download,
+            )
+        except Exception as e:
+            logger.warning(f"failed to download file content from {source_url}: {e}")
+            continue
+        result_file_data = original_filedata.model_copy(deep=True)
+        result_file_data.metadata.url = source_url
+        result_file_data.metadata.record_locator["parent_url"] = url
+        result_file_data.identifier = str(
+            uuid5(NAMESPACE_DNS, source_url + original_filedata.identifier)
+        )
+        filename = Path(urlparse(url=source_url).path).name
+        result_file_data.source_identifiers = SourceIdentifiers(
+            filename=filename, fullpath=filename
+        )
+        result_file_data.local_download_path = downloaded_path.as_posix()
+        results.append(DownloadResponse(file_data=result_file_data, path=downloaded_path))
+    return results
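
The new `unstructured_ingest.utils.html` module gives connectors two HTML post-processing steps: `convert_image_tags` inlines `<img>` sources as base64 data URIs, and `download_embedded_files` fetches files linked from `<a>` tags and wraps them as extra `DownloadResponse`s. A rough sketch of the image path only, assuming a made-up URL and HTML snippet (each `img` src triggers a real HTTP request):

import requests

from unstructured_ingest.utils.html import convert_image_tags

# Illustrative page fragment with a relative image source.
html = '<html><body><p>Logo:</p><img src="/static/logo.png"></body></html>'

inlined = convert_image_tags(
    url="https://example.com/docs/page",  # base used to resolve relative and protocol-relative srcs
    original_html=html,
    session=requests.Session(),  # optional; a fresh session is created when omitted
)

# Each successfully fetched <img> now carries a data:image/png;base64,... src;
# failures are logged and leave the original src untouched.
print(inlined[:120])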

unstructured_ingest/utils/ndjson.py

@@ -0,0 +1,52 @@
+import json
+from typing import IO, Any
+
+
+def dumps(obj: list[dict[str, Any]], **kwargs) -> str:
+    return "\n".join(json.dumps(each, **kwargs) for each in obj)
+
+
+def dump(obj: list[dict[str, Any]], fp: IO, **kwargs) -> None:
+    # Indent breaks ndjson formatting
+    kwargs["indent"] = None
+    text = dumps(obj, **kwargs)
+    fp.write(text)
+
+
+def loads(s: str, **kwargs) -> list[dict[str, Any]]:
+    return [json.loads(line, **kwargs) for line in s.splitlines()]
+
+
+def load(fp: IO, **kwargs) -> list[dict[str, Any]]:
+    return loads(fp.read(), **kwargs)
+
+
+class writer(object):
+    def __init__(self, f, **kwargs):
+        self.f = f
+        self.kwargs = kwargs
+
+    def write(self, row):
+        stringified = json.dumps(row, **self.kwargs)
+        self.f.write(stringified + "\n")
+
+
+class reader(object):
+    def __init__(self, f, **kwargs):
+        self.f = f
+        self.kwargs = kwargs
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        line = ""
+
+        while line == "":
+            line = next(self.f).strip()
+
+        return json.loads(line, **self.kwargs)
+
+    # NOTE: this is necessary to comply with py27
+    def next(self):
+        return self.__next__()
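
This small vendored `ndjson` module is what lets the rest of the diff drop the third-party `import ndjson` in favor of `from unstructured_ingest.utils import ndjson`; it mirrors the external package's surface (`dump`/`dumps`, `load`/`loads`, and streaming `writer`/`reader` classes). A quick round-trip sketch, with the file paths being placeholders:

from unstructured_ingest.utils import ndjson

rows = [{"id": 1, "text": "first"}, {"id": 2, "text": "second"}]

# Whole-list helpers: one JSON object per line; dump() always suppresses indentation.
with open("/tmp/rows.ndjson", "w") as f:
    ndjson.dump(rows, f)
with open("/tmp/rows.ndjson") as f:
    assert ndjson.load(f) == rows

# Streaming helpers, the same pattern UploadStager uses further down in this diff.
with open("/tmp/rows.ndjson") as in_f, open("/tmp/copy.ndjson", "w") as out_f:
    writer = ndjson.writer(out_f)
    for row in ndjson.reader(in_f):
        writer.write(row=row)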

unstructured_ingest/v2/interfaces/upload_stager.py

@@ -2,11 +2,11 @@ import json
 from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any,
+from typing import Any, TypeVar

-import ndjson
 from pydantic import BaseModel

+from unstructured_ingest.utils import ndjson
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.process import BaseProcess

@@ -22,16 +22,6 @@ UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
 class UploadStager(BaseProcess, ABC):
     upload_stager_config: UploadStagerConfigT

-    def write_output(self, output_path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
-        if output_path.suffix == ".json":
-            with output_path.open("w") as f:
-                json.dump(data, f, indent=indent)
-        elif output_path.suffix == ".ndjson":
-            with output_path.open("w") as f:
-                ndjson.dump(data, f)
-        else:
-            raise ValueError(f"Unsupported output format: {output_path}")
-
     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         return element_dict

@@ -49,7 +39,7 @@ class UploadStager(BaseProcess, ABC):
                writer = ndjson.writer(out_f)
                for element in reader:
                    conformed_element = self.conform_dict(element_dict=element, file_data=file_data)
-                   writer.
+                   writer.write(row=conformed_element)
                    writer.f.flush()

     def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:

unstructured_ingest/v2/pipeline/steps/chunk.py

@@ -1,10 +1,10 @@
 import asyncio
 import hashlib
-import json
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict

+from unstructured_ingest.utils.data_prep import write_data
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
@@ -44,9 +44,8 @@ class ChunkStep(PipelineStep):
         return filepath

     def _save_output(self, output_filepath: str, chunked_content: list[dict]):
-
-
-            json.dump(chunked_content, f, indent=2)
+        logger.debug(f"writing chunker output to: {output_filepath}")
+        write_data(path=Path(output_filepath), data=chunked_content)

     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str, **kwargs

unstructured_ingest/v2/pipeline/steps/embed.py

@@ -1,10 +1,10 @@
 import asyncio
 import hashlib
-import json
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict

+from unstructured_ingest.utils.data_prep import write_data
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
@@ -44,9 +44,8 @@ class EmbedStep(PipelineStep):
         return filepath

     def _save_output(self, output_filepath: str, embedded_content: list[dict]):
-
-
-            json.dump(embedded_content, f, indent=2)
+        logger.debug(f"writing embedded output to: {output_filepath}")
+        write_data(path=Path(output_filepath), data=embedded_content)

     async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
         path = Path(path)

unstructured_ingest/v2/pipeline/steps/partition.py

@@ -1,10 +1,10 @@
 import asyncio
 import hashlib
-import json
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict

+from unstructured_ingest.utils.data_prep import write_data
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
@@ -44,9 +44,8 @@ class PartitionStep(PipelineStep):
         return filepath

     def _save_output(self, output_filepath: str, partitioned_content: list[dict]):
-
-
-            json.dump(partitioned_content, f, indent=2)
+        logger.debug(f"writing partitioned output to: {output_filepath}")
+        write_data(path=Path(output_filepath), data=partitioned_content)

     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str

unstructured_ingest/v2/processes/connectors/confluence.py

@@ -1,3 +1,4 @@
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import TYPE_CHECKING, Generator, List, Optional
@@ -17,6 +18,7 @@ from unstructured_ingest.v2.interfaces import (
     Indexer,
     IndexerConfig,
     SourceIdentifiers,
+    download_responses,
 )
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.processes.connector_registry import (
@@ -71,17 +73,19 @@ class ConfluenceConnectionConfig(ConnectionConfig):
     )

     @requires_dependencies(["atlassian"], extras="confluence")
+    @contextmanager
     def get_client(self) -> "Confluence":
         from atlassian import Confluence

         access_configs = self.access_config.get_secret_value()
-
+        with Confluence(
             url=self.url,
             username=self.username,
             password=access_configs.password,
             token=access_configs.token,
             cloud=self.cloud,
-        )
+        ) as client:
+            yield client


 class ConfluenceIndexerConfig(IndexerConfig):
@@ -103,8 +107,8 @@ class ConfluenceIndexer(Indexer):

             # Attempt to retrieve a list of spaces with limit=1.
             # This should only succeed if all creds are valid
-
-
+            with self.connection_config.get_client() as client:
+                client.get_all_spaces(limit=1)
             logger.info("Connection to Confluence successful.")
             return True
         except Exception as e:
@@ -116,21 +120,21 @@ class ConfluenceIndexer(Indexer):
         if spaces:
             return spaces
         else:
-
-
+            with self.connection_config.get_client() as client:
+                all_spaces = client.get_all_spaces(limit=self.index_config.max_num_of_spaces)
             space_ids = [space["key"] for space in all_spaces["results"]]
             return space_ids

     def _get_docs_ids_within_one_space(self, space_id: str) -> List[dict]:
-
-
-
-
-
-
-
-
-
+        with self.connection_config.get_client() as client:
+            pages = client.get_all_pages_from_space(
+                space=space_id,
+                start=0,
+                limit=self.index_config.max_num_of_docs_from_each_space,
+                expand=None,
+                content_type="page",
+                status=None,
+            )
         doc_ids = [{"space_id": space_id, "doc_id": page["id"]} for page in pages]
         return doc_ids

@@ -177,7 +181,18 @@ class ConfluenceIndexer(Indexer):


 class ConfluenceDownloaderConfig(DownloaderConfig):
-
+    extract_images: bool = Field(
+        default=False,
+        description="if true, will download images and replace "
+        "the html content with base64 encoded images",
+    )
+    extract_files: bool = Field(
+        default=False, description="if true, will download any embedded files"
+    )
+    force_download: bool = Field(
+        default=False,
+        description="if true, will redownload extracted files even if they already exist locally",
+    )


 @dataclass
@@ -186,14 +201,37 @@ class ConfluenceDownloader(Downloader):
     download_config: ConfluenceDownloaderConfig = field(default_factory=ConfluenceDownloaderConfig)
     connector_type: str = CONNECTOR_TYPE

-    def
+    def download_embedded_files(
+        self, session, html: str, current_file_data: FileData
+    ) -> list[DownloadResponse]:
+        if not self.download_config.extract_files:
+            return []
+        from unstructured_ingest.utils.html import download_embedded_files
+
+        filepath = current_file_data.source_identifiers.relative_path
+        download_path = Path(self.download_dir) / filepath
+        download_dir = download_path.with_suffix("")
+        return download_embedded_files(
+            download_dir=download_dir,
+            original_filedata=current_file_data,
+            original_html=html,
+            session=session,
+            force_download=self.download_config.force_download,
+        )
+
+    def run(self, file_data: FileData, **kwargs) -> download_responses:
+        from bs4 import BeautifulSoup
+
+        from unstructured_ingest.utils.html import convert_image_tags
+
         doc_id = file_data.identifier
         try:
-
-
-
-
-
+            with self.connection_config.get_client() as client:
+                page = client.get_page_by_id(
+                    page_id=doc_id,
+                    expand="history.lastUpdated,version,body.view",
+                )
+
         except Exception as e:
             logger.error(f"Failed to retrieve page with ID {doc_id}: {e}", exc_info=True)
             raise SourceConnectionError(f"Failed to retrieve page with ID {doc_id}: {e}")
@@ -202,20 +240,52 @@ class ConfluenceDownloader(Downloader):
             raise ValueError(f"Page with ID {doc_id} does not exist.")

         content = page["body"]["view"]["value"]
+        # This supports v2 html parsing in unstructured
+        title = page["title"]
+        title_html = f"<title>{title}</title>"
+        content = f"<body class='Document' >{title_html}{content}</body>"
+        if self.download_config.extract_images:
+            with self.connection_config.get_client() as client:
+                content = convert_image_tags(
+                    url=file_data.metadata.url, original_html=content, session=client._session
+                )

         filepath = file_data.source_identifiers.relative_path
         download_path = Path(self.download_dir) / filepath
         download_path.parent.mkdir(parents=True, exist_ok=True)
         with open(download_path, "w", encoding="utf8") as f:
-
+            soup = BeautifulSoup(content, "html.parser")
+            f.write(soup.prettify())

         # Update file_data with metadata
         file_data.metadata.date_created = page["history"]["createdDate"]
         file_data.metadata.date_modified = page["version"]["when"]
         file_data.metadata.version = str(page["version"]["number"])
-        file_data.display_name =
+        file_data.display_name = title

-
+        download_response = self.generate_download_response(
+            file_data=file_data, download_path=download_path
+        )
+        if self.download_config.extract_files:
+            with self.connection_config.get_client() as client:
+                extracted_download_responses = self.download_embedded_files(
+                    html=content,
+                    current_file_data=download_response["file_data"],
+                    session=client._session,
+                )
+            if extracted_download_responses:
+                for dr in extracted_download_responses:
+                    fd = dr["file_data"]
+                    source_file_path = Path(file_data.source_identifiers.fullpath).with_suffix(
+                        ""
+                    )
+                    new_fullpath = source_file_path / fd.source_identifiers.filename
+                    fd.source_identifiers = SourceIdentifiers(
+                        fullpath=new_fullpath.as_posix(), filename=new_fullpath.name
+                    )
+                extracted_download_responses.append(download_response)
+                return extracted_download_responses
+        return download_response


 confluence_source_entry = SourceRegistryEntry(
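
Functionally, the Confluence changes above do two things: `get_client` is now a context manager (every call site becomes `with ...get_client() as client:`), and the downloader gains three opt-in flags. A hedged sketch of the new config surface, assuming the inherited `DownloaderConfig` fields all have defaults and treating the connection setup as already built elsewhere:

from unstructured_ingest.v2.processes.connectors.confluence import ConfluenceDownloaderConfig

# New in 0.4.1; all three flags default to False.
download_config = ConfluenceDownloaderConfig(
    extract_images=True,   # rewrite <img> srcs in the stored HTML to base64 data URIs
    extract_files=True,    # also download files referenced by <a> tags as separate responses
    force_download=False,  # when True, re-fetch embedded files even if already on disk
)

# Illustrative use of the context-managed client on an existing ConfluenceConnectionConfig.
def confluence_precheck(connection_config) -> bool:
    with connection_config.get_client() as client:
        client.get_all_spaces(limit=1)
    return True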

unstructured_ingest/v2/processes/connectors/duckdb/base.py

@@ -4,7 +4,7 @@ from typing import Any

 import pandas as pd

-from unstructured_ingest.utils.data_prep import get_data
+from unstructured_ingest.utils.data_prep import get_data, write_data
 from unstructured_ingest.v2.interfaces import FileData, UploadStager
 from unstructured_ingest.v2.utils import get_enhanced_element_id

@@ -96,5 +96,5 @@ class BaseDuckDBUploadStager(UploadStager):
             df[column] = df[column].apply(str)

         data = df.to_dict(orient="records")
-
+        write_data(path=output_path, data=data)
         return output_path

unstructured_ingest/v2/processes/connectors/fsspec/azure.py

@@ -128,22 +128,22 @@ class AzureIndexer(FsspecIndexer):
     def sterilize_info(self, file_data: dict) -> dict:
         return sterilize_dict(data=file_data, default=azure_json_serial)

-    def get_metadata(self,
-        path =
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"]
         date_created = (
-            str(
-            if "creation_time" in
+            str(file_info.get("creation_time").timestamp())
+            if "creation_time" in file_info
             else None
         )
         date_modified = (
-            str(
-            if "last_modified" in
+            str(file_info.get("last_modified").timestamp())
+            if "last_modified" in file_info
             else None
         )

-        file_size =
+        file_size = file_info.get("size") if "size" in file_info else None

-        version =
+        version = file_info.get("etag")
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,

unstructured_ingest/v2/processes/connectors/fsspec/box.py

@@ -104,22 +104,22 @@ class BoxIndexer(FsspecIndexer):
     index_config: BoxIndexerConfig
     connector_type: str = CONNECTOR_TYPE

-    def get_metadata(self,
-        path =
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"]
         date_created = None
         date_modified = None
-        if modified_at_str :=
+        if modified_at_str := file_info.get("modified_at"):
             date_modified = str(parser.parse(modified_at_str).timestamp())
-        if created_at_str :=
+        if created_at_str := file_info.get("created_at"):
             date_created = str(parser.parse(created_at_str).timestamp())

-        file_size =
+        file_size = file_info.get("size") if "size" in file_info else None

-        version =
+        version = file_info.get("id")
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
-            "file_id":
+            "file_id": file_info.get("id"),
         }
         return FileDataSourceMetadata(
             date_created=date_created,

unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py

@@ -93,15 +93,15 @@ class DropboxIndexer(FsspecIndexer):
     index_config: DropboxIndexerConfig
     connector_type: str = CONNECTOR_TYPE

-    def get_path(self,
-        return
+    def get_path(self, file_info: dict) -> str:
+        return file_info["name"]

-    def get_metadata(self,
-        path =
+    def get_metadata(self, file_info: dict) -> FileDataSourceMetadata:
+        path = file_info["name"].lstrip("/")
         date_created = None
         date_modified = None
-        server_modified =
-        client_modified =
+        server_modified = file_info.get("server_modified")
+        client_modified = file_info.get("client_modified")
         if server_modified and client_modified and server_modified > client_modified:
             date_created = str(client_modified.timestamp())
             date_modified = str(server_modified.timestamp())
@@ -109,13 +109,13 @@ class DropboxIndexer(FsspecIndexer):
             date_created = str(server_modified.timestamp())
             date_modified = str(client_modified.timestamp())

-        file_size =
+        file_size = file_info.get("size") if "size" in file_info else None

-        version =
+        version = file_info.get("content_hash")
         record_locator = {
             "protocol": self.index_config.protocol,
             "remote_file_path": self.index_config.remote_url,
-            "file_id":
+            "file_id": file_info.get("id"),
         }
         return FileDataSourceMetadata(
             date_created=date_created,