unstructured-ingest 0.3.15__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- test/integration/connectors/test_confluence.py +4 -4
- test/integration/connectors/utils/validation/equality.py +2 -1
- test/unit/v2/connectors/sql/__init__.py +0 -0
- test/unit/v2/connectors/sql/test_sql.py +74 -0
- test/unit/v2/connectors/test_confluence.py +6 -6
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/utils/data_prep.py +11 -3
- unstructured_ingest/utils/html.py +109 -0
- unstructured_ingest/utils/ndjson.py +52 -0
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -13
- unstructured_ingest/v2/pipeline/steps/chunk.py +3 -4
- unstructured_ingest/v2/pipeline/steps/embed.py +3 -4
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -4
- unstructured_ingest/v2/processes/connectors/confluence.py +125 -35
- unstructured_ingest/v2/processes/connectors/duckdb/base.py +2 -2
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +8 -8
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +7 -7
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +9 -9
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +41 -9
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +7 -7
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +8 -8
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +5 -5
- unstructured_ingest/v2/processes/connectors/sql/__init__.py +4 -0
- unstructured_ingest/v2/processes/connectors/sql/singlestore.py +2 -1
- unstructured_ingest/v2/processes/connectors/sql/snowflake.py +1 -1
- unstructured_ingest/v2/processes/connectors/sql/sql.py +14 -8
- unstructured_ingest/v2/processes/connectors/sql/sqlite.py +2 -1
- unstructured_ingest/v2/processes/connectors/sql/vastdb.py +270 -0
- {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/METADATA +18 -15
- {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/RECORD +34 -29
- {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/top_level.txt +0 -0
test/integration/connectors/test_confluence.py

@@ -30,10 +30,10 @@ async def test_confluence_source(temp_dir):
     spaces = ["testteamsp", "MFS"]

     # Create connection and indexer configurations
-    access_config = ConfluenceAccessConfig(api_token=api_token)
+    access_config = ConfluenceAccessConfig(password=api_token)
     connection_config = ConfluenceConnectionConfig(
         url=confluence_url,
-        user_email=user_email,
+        username=user_email,
         access_config=access_config,
     )
     index_config = ConfluenceIndexerConfig(

@@ -77,10 +77,10 @@ async def test_confluence_source_large(temp_dir):
     spaces = ["testteamsp1"]

     # Create connection and indexer configurations
-    access_config = ConfluenceAccessConfig(api_token=api_token)
+    access_config = ConfluenceAccessConfig(password=api_token)
     connection_config = ConfluenceConnectionConfig(
         url=confluence_url,
-        user_email=user_email,
+        username=user_email,
         access_config=access_config,
     )
     index_config = ConfluenceIndexerConfig(
test/integration/connectors/utils/validation/equality.py

@@ -1,10 +1,11 @@
 import json
 from pathlib import Path

-import ndjson
 from bs4 import BeautifulSoup
 from deepdiff import DeepDiff

+from unstructured_ingest.utils import ndjson
+

 def json_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
     with expected_filepath.open() as f:
test/unit/v2/connectors/sql/__init__.py: file without changes
test/unit/v2/connectors/sql/test_sql.py

@@ -0,0 +1,74 @@
+from pathlib import Path
+
+import pytest
+from pytest_mock import MockerFixture
+
+from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.sql.sql import SQLUploadStager
+
+
+@pytest.fixture
+def mock_instance() -> SQLUploadStager:
+    return SQLUploadStager()
+
+
+@pytest.mark.parametrize(
+    ("input_filepath", "output_filename", "expected"),
+    [
+        (
+            "/path/to/input_file.ndjson",
+            "output_file.ndjson",
+            "output_file.ndjson",
+        ),
+        ("input_file.txt", "output_file.json", "output_file.txt"),
+        ("/path/to/input_file.json", "output_file", "output_file.json"),
+    ],
+)
+def test_run_output_filename_suffix(
+    mocker: MockerFixture,
+    mock_instance: SQLUploadStager,
+    input_filepath: str,
+    output_filename: str,
+    expected: str,
+):
+    output_dir = Path("/tmp/test/output_dir")
+
+    # Mocks
+    mock_get_data = mocker.patch(
+        "unstructured_ingest.v2.processes.connectors.sql.sql.get_data",
+        return_value=[{"key": "value"}, {"key": "value2"}],
+    )
+    mock_conform_dict = mocker.patch.object(
+        SQLUploadStager, "conform_dict", side_effect=lambda element_dict, file_data: element_dict
+    )
+    mock_conform_dataframe = mocker.patch.object(
+        SQLUploadStager, "conform_dataframe", side_effect=lambda df: df
+    )
+    mock_get_output_path = mocker.patch.object(
+        SQLUploadStager, "get_output_path", return_value=output_dir / expected
+    )
+    mock_write_output = mocker.patch(
+        "unstructured_ingest.v2.processes.connectors.sql.sql.write_data", return_value=None
+    )
+
+    # Act
+    result = mock_instance.run(
+        elements_filepath=Path(input_filepath),
+        file_data=FileData(
+            identifier="test",
+            connector_type="test",
+            source_identifiers=SourceIdentifiers(filename=input_filepath, fullpath=input_filepath),
+        ),
+        output_dir=output_dir,
+        output_filename=output_filename,
+    )
+
+    # Assert
+    mock_get_data.assert_called_once_with(path=Path(input_filepath))
+    assert mock_conform_dict.call_count == 2
+    mock_conform_dataframe.assert_called_once()
+    mock_get_output_path.assert_called_once_with(output_filename=expected, output_dir=output_dir)
+    mock_write_output.assert_called_once_with(
+        path=output_dir / expected, data=[{"key": "value"}, {"key": "value2"}]
+    )
+    assert result.name == expected
test/unit/v2/connectors/test_confluence.py

@@ -11,10 +11,10 @@ def test_connection_config_multiple_auth():
     with pytest.raises(ValidationError):
         ConfluenceConnectionConfig(
             access_config=ConfluenceAccessConfig(
-
-
+                password="api_token",
+                token="access_token",
             ),
-
+            username="user_email",
             url="url",
         )

@@ -26,14 +26,14 @@ def test_connection_config_no_auth():

 def test_connection_config_basic_auth():
     ConfluenceConnectionConfig(
-        access_config=ConfluenceAccessConfig(
+        access_config=ConfluenceAccessConfig(password="api_token"),
         url="url",
-
+        username="user_email",
     )


 def test_connection_config_pat_auth():
     ConfluenceConnectionConfig(
-        access_config=ConfluenceAccessConfig(
+        access_config=ConfluenceAccessConfig(token="access_token"),
         url="url",
     )
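The updated unit tests show the new auth surface: basic auth now sets username on the connection config and passes the API token as the password field of the access config, while personal-access-token auth passes only token. A minimal sketch of both modes, assuming the configs are importable from unstructured_ingest.v2.processes.connectors.confluence (the module in the file list above) and using placeholder credentials:

from unstructured_ingest.v2.processes.connectors.confluence import (
    ConfluenceAccessConfig,
    ConfluenceConnectionConfig,
)

# Basic auth: username on the connection config, API token as the password field
basic_auth = ConfluenceConnectionConfig(
    url="https://example.atlassian.net",  # placeholder instance URL
    username="user@example.com",          # placeholder user email
    access_config=ConfluenceAccessConfig(password="example-api-token"),
)

# Personal access token auth: token only, no username
pat_auth = ConfluenceConnectionConfig(
    url="https://confluence.example.com",  # placeholder instance URL
    access_config=ConfluenceAccessConfig(token="example-access-token"),
)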
unstructured_ingest/__version__.py

@@ -1 +1 @@
-__version__ = "0.3.15"  # pragma: no cover
+__version__ = "0.4.1"  # pragma: no cover
unstructured_ingest/utils/data_prep.py

@@ -4,9 +4,9 @@ from datetime import datetime
 from pathlib import Path
 from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast

-import ndjson
 import pandas as pd

+from unstructured_ingest.utils import ndjson
 from unstructured_ingest.v2.logger import logger

 DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
@@ -153,6 +153,16 @@ def get_data_by_suffix(path: Path) -> list[dict]:
     raise ValueError(f"Unsupported file type: {path}")


+def write_data(path: Path, data: list[dict], indent: int = 2) -> None:
+    with path.open("w") as f:
+        if path.suffix == ".json":
+            json.dump(data, f, indent=indent, ensure_ascii=False)
+        elif path.suffix == ".ndjson":
+            ndjson.dump(data, f, ensure_ascii=False)
+        else:
+            raise IOError("Unsupported file type: {path}")
+
+
 def get_data(path: Path) -> list[dict]:
     try:
         return get_data_by_suffix(path=path)
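The new write_data helper dispatches on the file suffix: .json gets an indented json.dump, .ndjson goes through the in-repo ndjson module, and anything else raises IOError (note the error message is a plain string rather than an f-string in this release). A small usage sketch with hypothetical paths and element dicts:

from pathlib import Path

from unstructured_ingest.utils.data_prep import write_data

elements = [
    {"type": "Title", "text": "Example"},
    {"type": "NarrativeText", "text": "Example body text."},
]

# Pretty-printed JSON array
write_data(path=Path("/tmp/elements.json"), data=elements)
# One JSON object per line
write_data(path=Path("/tmp/elements.ndjson"), data=elements)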
@@ -179,8 +189,6 @@ def get_data(path: Path) -> list[dict]:
     except Exception as e:
         logger.warning(f"failed to read {path} as parquet: {e}")

-    raise IOError(f"File could not be parsed: {path}")
-

 def get_data_df(path: Path) -> pd.DataFrame:
     with path.open() as f:
unstructured_ingest/utils/html.py

@@ -0,0 +1,109 @@
+import base64
+from pathlib import Path
+from typing import Optional
+from urllib.parse import urlparse
+from uuid import NAMESPACE_DNS, uuid5
+
+import requests
+from bs4 import BeautifulSoup
+from requests import Session
+
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, SourceIdentifiers
+from unstructured_ingest.v2.logger import logger
+
+
+def convert_image_tags(url: str, original_html: str, session: Optional[Session] = None) -> str:
+    session = session or requests.Session()
+    parsed_url = urlparse(url)
+    base_url = parsed_url.scheme + "://" + parsed_url.netloc
+    soup = BeautifulSoup(original_html, "html.parser")
+    images = soup.find_all("img")
+    for image in images:
+        current_source = image["src"]
+        if current_source.startswith("//"):
+            source_url = f"{parsed_url.scheme}:{current_source}"
+        elif current_source.startswith("http"):
+            source_url = current_source
+        else:
+            source_url = base_url + current_source
+        try:
+            response = session.get(source_url)
+            response.raise_for_status()
+            image_content = response.content
+            logger.debug(
+                "img tag having src updated from {} to base64 content".format(image["src"])
+            )
+            image["src"] = f"data:image/png;base64,{base64.b64encode(image_content).decode()}"
+        except Exception as e:
+            logger.warning(
+                f"failed to download image content from {source_url}: {e}", exc_info=True
+            )
+    return str(soup)
+
+
+def download_link(
+    download_dir: Path, link: str, session: Optional[Session] = None, force_download: bool = False
+) -> Path:
+    session = session or requests.Session()
+    filename = Path(urlparse(url=link).path).name
+    download_path = download_dir / filename
+    logger.debug(f"downloading file from {link} to {download_path}")
+    if download_path.exists() and download_path.is_file() and not force_download:
+        return download_path
+    with download_path.open("wb") as downloaded_file:
+        response = session.get(link)
+        response.raise_for_status()
+        downloaded_file.write(response.content)
+    return download_path
+
+
+def download_embedded_files(
+    download_dir: Path,
+    original_filedata: FileData,
+    original_html: str,
+    session: Optional[Session] = None,
+    force_download: bool = False,
+) -> list[DownloadResponse]:
+    session = session or requests.Session()
+    url = original_filedata.metadata.url
+    parsed_url = urlparse(url)
+    base_url = parsed_url.scheme + "://" + parsed_url.netloc
+    soup = BeautifulSoup(original_html, "html.parser")
+    tags = soup.find_all("a", href=True)
+    hrefs = [
+        tag["href"]
+        for tag in tags
+        if not tag["href"].startswith("#") and Path(tag["href"]).suffix != ""
+    ]
+    results = []
+    for current_source in hrefs:
+        download_dir.mkdir(parents=True, exist_ok=True)
+        if current_source.startswith("//"):
+            source_url = f"{parsed_url.scheme}:{current_source}"
+        elif current_source.startswith("http"):
+            source_url = current_source
+        else:
+            source_url = base_url + current_source
+        try:
+            downloaded_path = download_link(
+                download_dir=download_dir,
+                link=source_url,
+                session=session,
+                force_download=force_download,
+            )
+        except Exception as e:
+            logger.warning(f"failed to download file content from {source_url}: {e}")
+            continue
+        result_file_data = original_filedata.model_copy(deep=True)
+        result_file_data.metadata.url = source_url
+        result_file_data.metadata.record_locator["parent_url"] = url
+        result_file_data.identifier = str(
+            uuid5(NAMESPACE_DNS, source_url + original_filedata.identifier)
+        )
+        filename = Path(urlparse(url=source_url).path).name
+        result_file_data.source_identifiers = SourceIdentifiers(
+            filename=filename, fullpath=filename
+        )
+        result_file_data.local_download_path = downloaded_path.as_posix()
+        results.append(DownloadResponse(file_data=result_file_data, path=downloaded_path))
+    return results
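These new HTML utilities are thin wrappers around requests and BeautifulSoup: convert_image_tags resolves relative and protocol-relative img sources against the page URL and inlines each image as a base64 data URI (failures are logged and the original src is kept), while download_embedded_files walks anchor tags with file-like hrefs and returns DownloadResponse entries. A short sketch of the image-inlining helper with a hypothetical page URL and HTML snippet:

import requests

from unstructured_ingest.utils.html import convert_image_tags

html_snippet = '<p>See diagram: <img src="/images/diagram.png"></p>'  # hypothetical HTML

# Relative src values are resolved against the page's scheme and host before download;
# if the fetch fails, the original src is left untouched and a warning is logged.
inlined_html = convert_image_tags(
    url="https://wiki.example.com/pages/12345",  # hypothetical page URL
    original_html=html_snippet,
    session=requests.Session(),
)
print(inlined_html)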
unstructured_ingest/utils/ndjson.py

@@ -0,0 +1,52 @@
+import json
+from typing import IO, Any
+
+
+def dumps(obj: list[dict[str, Any]], **kwargs) -> str:
+    return "\n".join(json.dumps(each, **kwargs) for each in obj)
+
+
+def dump(obj: list[dict[str, Any]], fp: IO, **kwargs) -> None:
+    # Indent breaks ndjson formatting
+    kwargs["indent"] = None
+    text = dumps(obj, **kwargs)
+    fp.write(text)
+
+
+def loads(s: str, **kwargs) -> list[dict[str, Any]]:
+    return [json.loads(line, **kwargs) for line in s.splitlines()]
+
+
+def load(fp: IO, **kwargs) -> list[dict[str, Any]]:
+    return loads(fp.read(), **kwargs)
+
+
+class writer(object):
+    def __init__(self, f, **kwargs):
+        self.f = f
+        self.kwargs = kwargs
+
+    def write(self, row):
+        stringified = json.dumps(row, **self.kwargs)
+        self.f.write(stringified + "\n")
+
+
+class reader(object):
+    def __init__(self, f, **kwargs):
+        self.f = f
+        self.kwargs = kwargs
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        line = ""
+
+        while line == "":
+            line = next(self.f).strip()
+
+        return json.loads(line, **self.kwargs)
+
+    # NOTE: this is necessary to comply with py27
+    def next(self):
+        return self.__next__()
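This vendored ndjson module stands in for the third-party ndjson import removed elsewhere in this release: dumps/dump emit one JSON object per line (any indent kwarg is forced to None), loads/load read them back, and writer/reader stream rows. A quick round-trip sketch using in-memory buffers:

import io

from unstructured_ingest.utils import ndjson

rows = [{"id": 1, "text": "first"}, {"id": 2, "text": "second"}]

buffer = io.StringIO()
ndjson.dump(rows, buffer)                      # one JSON object per line
assert ndjson.loads(buffer.getvalue()) == rows

# Streaming interface
out = io.StringIO()
writer = ndjson.writer(out)
for row in rows:
    writer.write(row=row)
print(out.getvalue())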
unstructured_ingest/v2/interfaces/upload_stager.py

@@ -2,11 +2,11 @@ import json
 from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Optional, TypeVar
+from typing import Any, TypeVar

-import ndjson
 from pydantic import BaseModel

+from unstructured_ingest.utils import ndjson
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.process import BaseProcess


@@ -22,16 +22,6 @@ UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
 class UploadStager(BaseProcess, ABC):
     upload_stager_config: UploadStagerConfigT

-    def write_output(self, output_path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
-        if output_path.suffix == ".json":
-            with output_path.open("w") as f:
-                json.dump(data, f, indent=indent)
-        elif output_path.suffix == ".ndjson":
-            with output_path.open("w") as f:
-                ndjson.dump(data, f)
-        else:
-            raise ValueError(f"Unsupported output format: {output_path}")
-
     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         return element_dict

@@ -49,7 +39,7 @@ class UploadStager(BaseProcess, ABC):
             writer = ndjson.writer(out_f)
             for element in reader:
                 conformed_element = self.conform_dict(element_dict=element, file_data=file_data)
-                writer.writerow(conformed_element)
+                writer.write(row=conformed_element)
             writer.f.flush()

     def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
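With write_output removed from the base stager, staged output now goes through the vendored ndjson streaming pattern shown in the last hunk (or through write_data for whole-file output). A rough sketch of that streaming loop outside the class, with a hypothetical conform step and file paths:

from pathlib import Path

from unstructured_ingest.utils import ndjson


def conform(element: dict) -> dict:
    # stand-in for UploadStager.conform_dict
    element["conformed"] = True
    return element


input_file = Path("elements.ndjson")   # hypothetical input written by an earlier step
output_file = Path("staged.ndjson")    # hypothetical staged output

with input_file.open() as in_f, output_file.open("w") as out_f:
    reader = ndjson.reader(in_f)
    writer = ndjson.writer(out_f)
    for element in reader:
        writer.write(row=conform(element))
    writer.f.flush()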
unstructured_ingest/v2/pipeline/steps/chunk.py

@@ -1,10 +1,10 @@
 import asyncio
 import hashlib
-import json
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict

+from unstructured_ingest.utils.data_prep import write_data
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger

@@ -44,9 +44,8 @@ class ChunkStep(PipelineStep):
         return filepath

     def _save_output(self, output_filepath: str, chunked_content: list[dict]):
-        with open(output_filepath, "w") as f:
-            logger.debug(f"writing chunker output to: {output_filepath}")
-            json.dump(chunked_content, f, indent=2)
+        logger.debug(f"writing chunker output to: {output_filepath}")
+        write_data(path=Path(output_filepath), data=chunked_content)

     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str, **kwargs
unstructured_ingest/v2/pipeline/steps/embed.py

@@ -1,10 +1,10 @@
 import asyncio
 import hashlib
-import json
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict

+from unstructured_ingest.utils.data_prep import write_data
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger

@@ -44,9 +44,8 @@ class EmbedStep(PipelineStep):
         return filepath

     def _save_output(self, output_filepath: str, embedded_content: list[dict]):
-        with open(output_filepath, "w") as f:
-            logger.debug(f"writing embedded output to: {output_filepath}")
-            json.dump(embedded_content, f, indent=2)
+        logger.debug(f"writing embedded output to: {output_filepath}")
+        write_data(path=Path(output_filepath), data=embedded_content)

     async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
         path = Path(path)
unstructured_ingest/v2/pipeline/steps/partition.py

@@ -1,10 +1,10 @@
 import asyncio
 import hashlib
-import json
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict

+from unstructured_ingest.utils.data_prep import write_data
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger

@@ -44,9 +44,8 @@ class PartitionStep(PipelineStep):
         return filepath

     def _save_output(self, output_filepath: str, partitioned_content: list[dict]):
-        with open(output_filepath, "w") as f:
-            logger.debug(f"writing partitioned output to: {output_filepath}")
-            json.dump(partitioned_content, f, indent=2)
+        logger.debug(f"writing partitioned output to: {output_filepath}")
+        write_data(path=Path(output_filepath), data=partitioned_content)

     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str