unstructured-ingest 0.3.15__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of unstructured-ingest has been flagged as potentially problematic.

Files changed (34):
  1. test/integration/connectors/test_confluence.py +4 -4
  2. test/integration/connectors/utils/validation/equality.py +2 -1
  3. test/unit/v2/connectors/sql/__init__.py +0 -0
  4. test/unit/v2/connectors/sql/test_sql.py +74 -0
  5. test/unit/v2/connectors/test_confluence.py +6 -6
  6. unstructured_ingest/__version__.py +1 -1
  7. unstructured_ingest/utils/data_prep.py +11 -3
  8. unstructured_ingest/utils/html.py +109 -0
  9. unstructured_ingest/utils/ndjson.py +52 -0
  10. unstructured_ingest/v2/interfaces/upload_stager.py +3 -13
  11. unstructured_ingest/v2/pipeline/steps/chunk.py +3 -4
  12. unstructured_ingest/v2/pipeline/steps/embed.py +3 -4
  13. unstructured_ingest/v2/pipeline/steps/partition.py +3 -4
  14. unstructured_ingest/v2/processes/connectors/confluence.py +125 -35
  15. unstructured_ingest/v2/processes/connectors/duckdb/base.py +2 -2
  16. unstructured_ingest/v2/processes/connectors/fsspec/azure.py +8 -8
  17. unstructured_ingest/v2/processes/connectors/fsspec/box.py +7 -7
  18. unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +9 -9
  19. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +41 -9
  20. unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +7 -7
  21. unstructured_ingest/v2/processes/connectors/fsspec/s3.py +8 -8
  22. unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +5 -5
  23. unstructured_ingest/v2/processes/connectors/sql/__init__.py +4 -0
  24. unstructured_ingest/v2/processes/connectors/sql/singlestore.py +2 -1
  25. unstructured_ingest/v2/processes/connectors/sql/snowflake.py +1 -1
  26. unstructured_ingest/v2/processes/connectors/sql/sql.py +14 -8
  27. unstructured_ingest/v2/processes/connectors/sql/sqlite.py +2 -1
  28. unstructured_ingest/v2/processes/connectors/sql/vastdb.py +270 -0
  29. {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/METADATA +18 -15
  30. {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/RECORD +34 -29
  31. {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/LICENSE.md +0 -0
  32. {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/WHEEL +0 -0
  33. {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/entry_points.txt +0 -0
  34. {unstructured_ingest-0.3.15.dist-info → unstructured_ingest-0.4.1.dist-info}/top_level.txt +0 -0
test/integration/connectors/test_confluence.py

```diff
@@ -30,10 +30,10 @@ async def test_confluence_source(temp_dir):
     spaces = ["testteamsp", "MFS"]
 
     # Create connection and indexer configurations
-    access_config = ConfluenceAccessConfig(api_token=api_token)
+    access_config = ConfluenceAccessConfig(password=api_token)
     connection_config = ConfluenceConnectionConfig(
         url=confluence_url,
-        user_email=user_email,
+        username=user_email,
         access_config=access_config,
     )
     index_config = ConfluenceIndexerConfig(
@@ -77,10 +77,10 @@ async def test_confluence_source_large(temp_dir):
     spaces = ["testteamsp1"]
 
     # Create connection and indexer configurations
-    access_config = ConfluenceAccessConfig(api_token=api_token)
+    access_config = ConfluenceAccessConfig(password=api_token)
     connection_config = ConfluenceConnectionConfig(
         url=confluence_url,
-        user_email=user_email,
+        username=user_email,
         access_config=access_config,
     )
     index_config = ConfluenceIndexerConfig(
```
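The Confluence connector's auth fields were renamed in this release: `api_token` → `password`, `user_email` → `username`, and `access_token` → `token` (the unit-test hunks below exercise the same rename). A minimal usage sketch under the new names, assuming the configs are importable from the connector module listed above; the URL and credential values are placeholders:

```python
from unstructured_ingest.v2.processes.connectors.confluence import (
    ConfluenceAccessConfig,
    ConfluenceConnectionConfig,
)

# Basic auth: username + API token (formerly user_email + api_token)
basic_auth = ConfluenceConnectionConfig(
    url="https://example.atlassian.net",  # placeholder
    username="user@example.com",
    access_config=ConfluenceAccessConfig(password="<api-token>"),
)

# Personal-access-token auth (formerly access_token)
pat_auth = ConfluenceConnectionConfig(
    url="https://example.atlassian.net",
    access_config=ConfluenceAccessConfig(token="<personal-access-token>"),
)
```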
test/integration/connectors/utils/validation/equality.py

```diff
@@ -1,10 +1,11 @@
 import json
 from pathlib import Path
 
-import ndjson
 from bs4 import BeautifulSoup
 from deepdiff import DeepDiff
 
+from unstructured_ingest.utils import ndjson
+
 
 def json_equality_check(expected_filepath: Path, current_filepath: Path) -> bool:
     with expected_filepath.open() as f:
```
test/unit/v2/connectors/sql/__init__.py — file without changes (empty `__init__.py`)
test/unit/v2/connectors/sql/test_sql.py

```diff
@@ -0,0 +1,74 @@
+from pathlib import Path
+
+import pytest
+from pytest_mock import MockerFixture
+
+from unstructured_ingest.v2.interfaces.file_data import FileData, SourceIdentifiers
+from unstructured_ingest.v2.processes.connectors.sql.sql import SQLUploadStager
+
+
+@pytest.fixture
+def mock_instance() -> SQLUploadStager:
+    return SQLUploadStager()
+
+
+@pytest.mark.parametrize(
+    ("input_filepath", "output_filename", "expected"),
+    [
+        (
+            "/path/to/input_file.ndjson",
+            "output_file.ndjson",
+            "output_file.ndjson",
+        ),
+        ("input_file.txt", "output_file.json", "output_file.txt"),
+        ("/path/to/input_file.json", "output_file", "output_file.json"),
+    ],
+)
+def test_run_output_filename_suffix(
+    mocker: MockerFixture,
+    mock_instance: SQLUploadStager,
+    input_filepath: str,
+    output_filename: str,
+    expected: str,
+):
+    output_dir = Path("/tmp/test/output_dir")
+
+    # Mocks
+    mock_get_data = mocker.patch(
+        "unstructured_ingest.v2.processes.connectors.sql.sql.get_data",
+        return_value=[{"key": "value"}, {"key": "value2"}],
+    )
+    mock_conform_dict = mocker.patch.object(
+        SQLUploadStager, "conform_dict", side_effect=lambda element_dict, file_data: element_dict
+    )
+    mock_conform_dataframe = mocker.patch.object(
+        SQLUploadStager, "conform_dataframe", side_effect=lambda df: df
+    )
+    mock_get_output_path = mocker.patch.object(
+        SQLUploadStager, "get_output_path", return_value=output_dir / expected
+    )
+    mock_write_output = mocker.patch(
+        "unstructured_ingest.v2.processes.connectors.sql.sql.write_data", return_value=None
+    )
+
+    # Act
+    result = mock_instance.run(
+        elements_filepath=Path(input_filepath),
+        file_data=FileData(
+            identifier="test",
+            connector_type="test",
+            source_identifiers=SourceIdentifiers(filename=input_filepath, fullpath=input_filepath),
+        ),
+        output_dir=output_dir,
+        output_filename=output_filename,
+    )
+
+    # Assert
+    mock_get_data.assert_called_once_with(path=Path(input_filepath))
+    assert mock_conform_dict.call_count == 2
+    mock_conform_dataframe.assert_called_once()
+    mock_get_output_path.assert_called_once_with(output_filename=expected, output_dir=output_dir)
+    mock_write_output.assert_called_once_with(
+        path=output_dir / expected, data=[{"key": "value"}, {"key": "value2"}]
+    )
+    assert result.name == expected
```
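The parametrized cases above pin down the stager's filename rule: the staged output keeps the input file's suffix, and only falls back to `output_filename`'s own suffix when the input has none to impose. A hypothetical standalone restatement of that inferred rule (the `_output_name` helper is not part of the package):

```python
from pathlib import Path

def _output_name(input_filepath: str, output_filename: str) -> str:
    # Inferred from the parametrized cases: the output name always carries
    # the *input* file's suffix, overriding whatever suffix was requested.
    suffix = Path(input_filepath).suffix
    return f"{Path(output_filename).stem}{suffix}" if suffix else output_filename

assert _output_name("/path/to/input_file.ndjson", "output_file.ndjson") == "output_file.ndjson"
assert _output_name("input_file.txt", "output_file.json") == "output_file.txt"
assert _output_name("/path/to/input_file.json", "output_file") == "output_file.json"
```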
test/unit/v2/connectors/test_confluence.py

```diff
@@ -11,10 +11,10 @@ def test_connection_config_multiple_auth():
     with pytest.raises(ValidationError):
         ConfluenceConnectionConfig(
             access_config=ConfluenceAccessConfig(
-                api_token="api_token",
-                access_token="access_token",
+                password="api_token",
+                token="access_token",
             ),
-            user_email="user_email",
+            username="user_email",
             url="url",
         )
 
@@ -26,14 +26,14 @@ def test_connection_config_no_auth():
 
 def test_connection_config_basic_auth():
     ConfluenceConnectionConfig(
-        access_config=ConfluenceAccessConfig(api_token="api_token"),
+        access_config=ConfluenceAccessConfig(password="api_token"),
         url="url",
-        user_email="user_email",
+        username="user_email",
     )
 
 
 def test_connection_config_pat_auth():
     ConfluenceConnectionConfig(
-        access_config=ConfluenceAccessConfig(access_token="access_token"),
+        access_config=ConfluenceAccessConfig(token="access_token"),
         url="url",
     )
```
unstructured_ingest/__version__.py

```diff
@@ -1 +1 @@
-__version__ = "0.3.15" # pragma: no cover
+__version__ = "0.4.1" # pragma: no cover
```
unstructured_ingest/utils/data_prep.py

```diff
@@ -4,9 +4,9 @@ from datetime import datetime
 from pathlib import Path
 from typing import Any, Generator, Iterable, Optional, Sequence, TypeVar, cast
 
-import ndjson
 import pandas as pd
 
+from unstructured_ingest.utils import ndjson
 from unstructured_ingest.v2.logger import logger
 
 DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
@@ -153,6 +153,16 @@ def get_data_by_suffix(path: Path) -> list[dict]:
     raise ValueError(f"Unsupported file type: {path}")
 
 
+def write_data(path: Path, data: list[dict], indent: int = 2) -> None:
+    with path.open("w") as f:
+        if path.suffix == ".json":
+            json.dump(data, f, indent=indent, ensure_ascii=False)
+        elif path.suffix == ".ndjson":
+            ndjson.dump(data, f, ensure_ascii=False)
+        else:
+            raise IOError("Unsupported file type: {path}")
+
+
 def get_data(path: Path) -> list[dict]:
     try:
         return get_data_by_suffix(path=path)
@@ -179,8 +189,6 @@ def get_data(path: Path) -> list[dict]:
     except Exception as e:
         logger.warning(f"failed to read {path} as parquet: {e}")
 
-    raise IOError(f"File could not be parsed: {path}")
-
 
 def get_data_df(path: Path) -> pd.DataFrame:
     with path.open() as f:
```
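`write_data` is the new shared serializer: the path suffix selects indented JSON or NDJSON output, and anything else raises `IOError`. A short usage sketch (paths are placeholders):

```python
from pathlib import Path

from unstructured_ingest.utils.data_prep import write_data

elements = [{"text": "hello"}, {"text": "wörld"}]

# Suffix picks the serializer: .json -> indented JSON array, .ndjson -> one object per line.
write_data(path=Path("/tmp/elements.json"), data=elements)
write_data(path=Path("/tmp/elements.ndjson"), data=elements)
```

Two things worth noting in the released code: the `IOError` message lacks its `f` prefix, so the literal `{path}` is emitted rather than the offending path, and `get_data` no longer raises when every parser fails; it now falls through and implicitly returns `None`.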
unstructured_ingest/utils/html.py

```diff
@@ -0,0 +1,109 @@
+import base64
+from pathlib import Path
+from typing import Optional
+from urllib.parse import urlparse
+from uuid import NAMESPACE_DNS, uuid5
+
+import requests
+from bs4 import BeautifulSoup
+from requests import Session
+
+from unstructured_ingest.v2.interfaces import DownloadResponse, FileData, SourceIdentifiers
+from unstructured_ingest.v2.logger import logger
+
+
+def convert_image_tags(url: str, original_html: str, session: Optional[Session] = None) -> str:
+    session = session or requests.Session()
+    parsed_url = urlparse(url)
+    base_url = parsed_url.scheme + "://" + parsed_url.netloc
+    soup = BeautifulSoup(original_html, "html.parser")
+    images = soup.find_all("img")
+    for image in images:
+        current_source = image["src"]
+        if current_source.startswith("//"):
+            source_url = f"{parsed_url.scheme}:{current_source}"
+        elif current_source.startswith("http"):
+            source_url = current_source
+        else:
+            source_url = base_url + current_source
+        try:
+            response = session.get(source_url)
+            response.raise_for_status()
+            image_content = response.content
+            logger.debug(
+                "img tag having src updated from {} to base64 content".format(image["src"])
+            )
+            image["src"] = f"data:image/png;base64,{base64.b64encode(image_content).decode()}"
+        except Exception as e:
+            logger.warning(
+                f"failed to download image content from {source_url}: {e}", exc_info=True
+            )
+    return str(soup)
+
+
+def download_link(
+    download_dir: Path, link: str, session: Optional[Session] = None, force_download: bool = False
+) -> Path:
+    session = session or requests.Session()
+    filename = Path(urlparse(url=link).path).name
+    download_path = download_dir / filename
+    logger.debug(f"downloading file from {link} to {download_path}")
+    if download_path.exists() and download_path.is_file() and not force_download:
+        return download_path
+    with download_path.open("wb") as downloaded_file:
+        response = session.get(link)
+        response.raise_for_status()
+        downloaded_file.write(response.content)
+    return download_path
+
+
+def download_embedded_files(
+    download_dir: Path,
+    original_filedata: FileData,
+    original_html: str,
+    session: Optional[Session] = None,
+    force_download: bool = False,
+) -> list[DownloadResponse]:
+    session = session or requests.Session()
+    url = original_filedata.metadata.url
+    parsed_url = urlparse(url)
+    base_url = parsed_url.scheme + "://" + parsed_url.netloc
+    soup = BeautifulSoup(original_html, "html.parser")
+    tags = soup.find_all("a", href=True)
+    hrefs = [
+        tag["href"]
+        for tag in tags
+        if not tag["href"].startswith("#") and Path(tag["href"]).suffix != ""
+    ]
+    results = []
+    for current_source in hrefs:
+        download_dir.mkdir(parents=True, exist_ok=True)
+        if current_source.startswith("//"):
+            source_url = f"{parsed_url.scheme}:{current_source}"
+        elif current_source.startswith("http"):
+            source_url = current_source
+        else:
+            source_url = base_url + current_source
+        try:
+            downloaded_path = download_link(
+                download_dir=download_dir,
+                link=source_url,
+                session=session,
+                force_download=force_download,
+            )
+        except Exception as e:
+            logger.warning(f"failed to download file content from {source_url}: {e}")
+            continue
+        result_file_data = original_filedata.model_copy(deep=True)
+        result_file_data.metadata.url = source_url
+        result_file_data.metadata.record_locator["parent_url"] = url
+        result_file_data.identifier = str(
+            uuid5(NAMESPACE_DNS, source_url + original_filedata.identifier)
+        )
+        filename = Path(urlparse(url=source_url).path).name
+        result_file_data.source_identifiers = SourceIdentifiers(
+            filename=filename, fullpath=filename
+        )
+        result_file_data.local_download_path = downloaded_path.as_posix()
+        results.append(DownloadResponse(file_data=result_file_data, path=downloaded_path))
+    return results
```
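`convert_image_tags` resolves each `img` src against the page URL (handling protocol-relative, absolute, and site-relative forms), downloads it, and swaps the src for a base64 data URI, logging and skipping failures; `download_embedded_files` does the analogous walk over `a[href]` links and emits one `DownloadResponse` per fetched file with a deterministic `uuid5` identifier. A toy invocation sketch (URL and HTML are made up):

```python
import requests

from unstructured_ingest.utils.html import convert_image_tags

html = '<html><body><img src="/static/logo.png"/></body></html>'  # toy input
inlined = convert_image_tags(
    url="https://example.com/page",  # placeholder page URL
    original_html=html,
    session=requests.Session(),
)
# inlined now carries the image bytes as a data: URI (or the original src on failure)
```

One quirk of the released code: every inlined image is labeled `image/png` in the data URI, regardless of the actual content type returned.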
unstructured_ingest/utils/ndjson.py

```diff
@@ -0,0 +1,52 @@
+import json
+from typing import IO, Any
+
+
+def dumps(obj: list[dict[str, Any]], **kwargs) -> str:
+    return "\n".join(json.dumps(each, **kwargs) for each in obj)
+
+
+def dump(obj: list[dict[str, Any]], fp: IO, **kwargs) -> None:
+    # Indent breaks ndjson formatting
+    kwargs["indent"] = None
+    text = dumps(obj, **kwargs)
+    fp.write(text)
+
+
+def loads(s: str, **kwargs) -> list[dict[str, Any]]:
+    return [json.loads(line, **kwargs) for line in s.splitlines()]
+
+
+def load(fp: IO, **kwargs) -> list[dict[str, Any]]:
+    return loads(fp.read(), **kwargs)
+
+
+class writer(object):
+    def __init__(self, f, **kwargs):
+        self.f = f
+        self.kwargs = kwargs
+
+    def write(self, row):
+        stringified = json.dumps(row, **self.kwargs)
+        self.f.write(stringified + "\n")
+
+
+class reader(object):
+    def __init__(self, f, **kwargs):
+        self.f = f
+        self.kwargs = kwargs
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        line = ""
+
+        while line == "":
+            line = next(self.f).strip()
+
+        return json.loads(line, **self.kwargs)
+
+    # NOTE: this is necessary to comply with py27
+    def next(self):
+        return self.__next__()
```
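This vendored module replaces the external `ndjson` dependency (dropped from the imports throughout this diff) and mirrors its `dump(s)`/`load(s)` plus `reader`/`writer` surface, with one rename: the writer method is `write`, not `writerow`. A quick round-trip check:

```python
import io

from unstructured_ingest.utils import ndjson

rows = [{"id": 1}, {"id": 2}]
assert ndjson.loads(ndjson.dumps(rows)) == rows

buf = io.StringIO()
writer = ndjson.writer(buf)
for row in rows:
    writer.write(row=row)  # renamed from the external package's writerow()
buf.seek(0)
assert list(ndjson.reader(buf)) == rows
```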
unstructured_ingest/v2/interfaces/upload_stager.py

```diff
@@ -2,11 +2,11 @@ import json
 from abc import ABC
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Optional, TypeVar
+from typing import Any, TypeVar
 
-import ndjson
 from pydantic import BaseModel
 
+from unstructured_ingest.utils import ndjson
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.process import BaseProcess
 
@@ -22,16 +22,6 @@ UploadStagerConfigT = TypeVar("UploadStagerConfigT", bound=UploadStagerConfig)
 class UploadStager(BaseProcess, ABC):
     upload_stager_config: UploadStagerConfigT
 
-    def write_output(self, output_path: Path, data: list[dict], indent: Optional[int] = 2) -> None:
-        if output_path.suffix == ".json":
-            with output_path.open("w") as f:
-                json.dump(data, f, indent=indent)
-        elif output_path.suffix == ".ndjson":
-            with output_path.open("w") as f:
-                ndjson.dump(data, f)
-        else:
-            raise ValueError(f"Unsupported output format: {output_path}")
-
     def conform_dict(self, element_dict: dict, file_data: FileData) -> dict:
         return element_dict
 
@@ -49,7 +39,7 @@ class UploadStager(BaseProcess, ABC):
             writer = ndjson.writer(out_f)
             for element in reader:
                 conformed_element = self.conform_dict(element_dict=element, file_data=file_data)
-                writer.writerow(row=conformed_element)
+                writer.write(row=conformed_element)
             writer.f.flush()
 
     def process_whole(self, input_file: Path, output_file: Path, file_data: FileData) -> None:
```
unstructured_ingest/v2/pipeline/steps/chunk.py

```diff
@@ -1,10 +1,10 @@
 import asyncio
 import hashlib
-import json
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
+from unstructured_ingest.utils.data_prep import write_data
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
@@ -44,9 +44,8 @@ class ChunkStep(PipelineStep):
         return filepath
 
     def _save_output(self, output_filepath: str, chunked_content: list[dict]):
-        with open(str(output_filepath), "w") as f:
-            logger.debug(f"writing chunker output to: {output_filepath}")
-            json.dump(chunked_content, f, indent=2)
+        logger.debug(f"writing chunker output to: {output_filepath}")
+        write_data(path=Path(output_filepath), data=chunked_content)
 
     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str, **kwargs
```
unstructured_ingest/v2/pipeline/steps/embed.py

```diff
@@ -1,10 +1,10 @@
 import asyncio
 import hashlib
-import json
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
+from unstructured_ingest.utils.data_prep import write_data
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
@@ -44,9 +44,8 @@ class EmbedStep(PipelineStep):
         return filepath
 
     def _save_output(self, output_filepath: str, embedded_content: list[dict]):
-        with open(str(output_filepath), "w") as f:
-            logger.debug(f"writing embedded output to: {output_filepath}")
-            json.dump(embedded_content, f, indent=2)
+        logger.debug(f"writing embedded output to: {output_filepath}")
+        write_data(path=Path(output_filepath), data=embedded_content)
 
     async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
         path = Path(path)
```
unstructured_ingest/v2/pipeline/steps/partition.py

```diff
@@ -1,10 +1,10 @@
 import asyncio
 import hashlib
-import json
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
+from unstructured_ingest.utils.data_prep import write_data
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.interfaces.file_data import file_data_from_file
 from unstructured_ingest.v2.logger import logger
@@ -44,9 +44,8 @@ class PartitionStep(PipelineStep):
         return filepath
 
     def _save_output(self, output_filepath: str, partitioned_content: list[dict]):
-        with open(str(output_filepath), "w") as f:
-            logger.debug(f"writing partitioned output to: {output_filepath}")
-            json.dump(partitioned_content, f, indent=2)
+        logger.debug(f"writing partitioned output to: {output_filepath}")
+        write_data(path=Path(output_filepath), data=partitioned_content)
 
     async def _run_async(
         self, fn: Callable, path: str, file_data_path: str
```