unstructured-ingest 0.0.0__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/connector/notion/helpers.py +1 -1
- unstructured_ingest/logger.py +2 -2
- unstructured_ingest/v2/cli/base/cmd.py +10 -0
- unstructured_ingest/v2/cli/base/src.py +2 -0
- unstructured_ingest/v2/cli/cmds/__init__.py +2 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +1 -9
- unstructured_ingest/v2/cli/cmds/local.py +0 -8
- unstructured_ingest/v2/cli/cmds/milvus.py +72 -0
- unstructured_ingest/v2/cli/configs/__init__.py +8 -1
- unstructured_ingest/v2/cli/configs/filter.py +28 -0
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/downloader.py +9 -3
- unstructured_ingest/v2/interfaces/file_data.py +6 -1
- unstructured_ingest/v2/interfaces/process.py +3 -0
- unstructured_ingest/v2/logger.py +1 -1
- unstructured_ingest/v2/pipeline/interfaces.py +3 -1
- unstructured_ingest/v2/pipeline/pipeline.py +72 -2
- unstructured_ingest/v2/pipeline/steps/download.py +77 -13
- unstructured_ingest/v2/pipeline/steps/filter.py +40 -0
- unstructured_ingest/v2/processes/connectors/__init__.py +4 -2
- unstructured_ingest/v2/processes/connectors/astra.py +8 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +8 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +8 -6
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +9 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +23 -9
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +22 -31
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -5
- unstructured_ingest/v2/processes/connectors/google_drive.py +13 -9
- unstructured_ingest/v2/processes/connectors/local.py +15 -15
- unstructured_ingest/v2/processes/connectors/milvus.py +200 -0
- unstructured_ingest/v2/processes/connectors/mongodb.py +10 -4
- unstructured_ingest/v2/processes/connectors/onedrive.py +14 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +10 -7
- unstructured_ingest/v2/processes/connectors/salesforce.py +10 -8
- unstructured_ingest/v2/processes/connectors/sharepoint.py +14 -8
- unstructured_ingest/v2/processes/connectors/sql.py +24 -9
- unstructured_ingest/v2/processes/connectors/weaviate.py +13 -5
- unstructured_ingest/v2/processes/filter.py +54 -0
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/METADATA +16 -14
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/RECORD +44 -39
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.0.dist-info → unstructured_ingest-0.0.2.dist-info}/top_level.txt +0 -0
unstructured_ingest/__version__.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.0.0"  # pragma: no cover
+__version__ = "0.0.2"  # pragma: no cover
unstructured_ingest/connector/notion/helpers.py
CHANGED
@@ -5,7 +5,6 @@ from typing import List, Optional, Tuple
 from urllib.parse import urlparse
 from uuid import UUID

-import unstructured.ingest.connector.notion.types.blocks as notion_blocks
 from htmlBuilder.attributes import Style, Type
 from htmlBuilder.tags import (
     Body,
@@ -23,6 +22,7 @@ from htmlBuilder.tags import (
 )
 from notion_client.errors import APIResponseError

+import unstructured_ingest.connector.notion.types.blocks as notion_blocks
 from unstructured_ingest.connector.notion.client import Client
 from unstructured_ingest.connector.notion.interfaces import BlockBase
 from unstructured_ingest.connector.notion.types.block import Block
unstructured_ingest/logger.py
CHANGED
@@ -3,7 +3,7 @@ import json
 import logging
 import typing as t

-logger = logging.getLogger("unstructured.ingest")
+logger = logging.getLogger("unstructured_ingest")


 def default_is_data_sensitive(k: str, v: t.Any) -> bool:
@@ -119,7 +119,7 @@ def ingest_log_streaming_init(level: int) -> None:

 def make_default_logger(level: int) -> logging.Logger:
     """Return a custom logger."""
-    logger = logging.getLogger("unstructured.ingest")
+    logger = logging.getLogger("unstructured_ingest")
     handler = logging.StreamHandler()
     handler.name = "ingest_log_handler"
     formatter = SensitiveFormatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s")
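Downstream code that tunes ingest logging should now address the renamed logger; a one-line check, assuming the 0.0.2 wheel is installed:

    import logging

    # The library's logs are emitted under "unstructured_ingest" as of 0.0.2.
    logging.getLogger("unstructured_ingest").setLevel(logging.DEBUG)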
unstructured_ingest/v2/cli/base/cmd.py
CHANGED
@@ -24,6 +24,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 from unstructured_ingest.v2.processes.connectors.local import LocalUploader, LocalUploaderConfig
 from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
+from unstructured_ingest.v2.processes.filter import Filterer, FiltererConfig
 from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig

 CommandT = TypeVar("CommandT", bound=click.Command)
@@ -75,6 +76,8 @@ class BaseCmd(ABC):
         }
         if chunker := self.get_chunker(options=source_options):
             pipeline_kwargs["chunker"] = chunker
+        if filterer := self.get_filterer(options=source_options):
+            pipeline_kwargs["filterer"] = filterer
         if embedder := self.get_embeder(options=source_options):
             pipeline_kwargs["embedder"] = embedder
         if dest:
@@ -105,6 +108,13 @@ class BaseCmd(ABC):
             return None
         return Chunker(config=chunker_config)

+    @staticmethod
+    def get_filterer(options: dict[str, Any]) -> Optional[Filterer]:
+        filterer_configs = extract_config(flat_data=options, config=FiltererConfig)
+        if not filterer_configs.to_dict():
+            return None
+        return Filterer(config=filterer_configs)
+
     @staticmethod
     def get_embeder(options: dict[str, Any]) -> Optional[Embedder]:
         embedder_config = extract_config(flat_data=options, config=EmbedderConfig)
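get_filterer only instantiates a Filterer when at least one filter flag survives extraction, so unconfigured runs skip the step entirely. The gating idea on a plain dict (flag names are assumed; extract_config itself is not shown in this diff):

    from typing import Any, Optional

    def get_filterer_sketch(options: dict[str, Any]) -> Optional[dict]:
        filter_keys = {"file_glob", "max_file_size"}  # assumed flag names
        config = {k: v for k, v in options.items() if k in filter_keys and v is not None}
        return config or None  # mirrors: empty extracted config -> no Filterer

    print(get_filterer_sketch({"file_glob": ["*.txt"], "verbose": True}))  # {'file_glob': ['*.txt']}
    print(get_filterer_sketch({"verbose": True}))  # None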
unstructured_ingest/v2/cli/base/src.py
CHANGED
@@ -8,6 +8,7 @@ from unstructured_ingest.v2.cli.base.cmd import BaseCmd
 from unstructured_ingest.v2.cli.configs import (
     ChunkerCliConfig,
     EmbedderCliConfig,
+    FilterCliConfig,
     PartitionerCliConfig,
     ProcessorCliConfig,
 )
@@ -26,6 +27,7 @@ class SrcCmd(BaseCmd):
         ProcessorCliConfig,
         PartitionerCliConfig,
         EmbedderCliConfig,
+        FilterCliConfig,
         ChunkerCliConfig,
     ]
 )
unstructured_ingest/v2/cli/cmds/__init__.py
CHANGED
@@ -15,6 +15,7 @@ from .fsspec.s3 import s3_dest_cmd, s3_src_cmd
 from .fsspec.sftp import sftp_dest_cmd, sftp_src_cmd
 from .google_drive import google_drive_src_cmd
 from .local import local_dest_cmd, local_src_cmd
+from .milvus import milvus_dest_cmd
 from .mongodb import mongodb_dest_cmd
 from .onedrive import onedrive_drive_src_cmd
 from .opensearch import opensearch_dest_cmd, opensearch_src_cmd
@@ -60,6 +61,7 @@ dest_cmds = [
     elasticsearch_dest_cmd,
     gcs_dest_cmd,
     local_dest_cmd,
+    milvus_dest_cmd,
     opensearch_dest_cmd,
     pinecone_dest_cmd,
     s3_dest_cmd,
unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py
CHANGED
@@ -3,7 +3,6 @@ from dataclasses import dataclass
 import click

 from unstructured_ingest.v2.cli.interfaces import CliConfig
-from unstructured_ingest.v2.cli.utils import DelimitedString


 @dataclass
@@ -14,7 +13,7 @@ class FsspecCliDownloadConfig(CliConfig):
         click.Option(
             ["--download-dir"],
             help="Where files are downloaded to, defaults to a location at"
-            "`$HOME/.cache/
+            "`$HOME/.cache/unstructured/ingest/<connector name>/<SHA256>`.",
         ),
     ]

@@ -65,13 +64,6 @@ class FsspecCliIndexerConfig(FsspecCliFileConfig):
                 help="Recursively download files in their respective folders "
                 "otherwise stop at the files in provided folder level.",
             ),
-            click.Option(
-                ["--file-glob"],
-                default=None,
-                type=DelimitedString(),
-                help="A comma-separated list of file globs to limit which types of "
-                "local files are accepted, e.g. '*.html,*.txt'",
-            ),
         ]
     )
     return options
unstructured_ingest/v2/cli/cmds/local.py
CHANGED
@@ -4,7 +4,6 @@ import click

 from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
 from unstructured_ingest.v2.cli.interfaces import CliConfig
-from unstructured_ingest.v2.cli.utils import DelimitedString
 from unstructured_ingest.v2.processes.connectors.local import CONNECTOR_TYPE


@@ -19,13 +18,6 @@ class LocalCliIndexerConfig(CliConfig):
                 type=click.Path(file_okay=True, dir_okay=True, exists=True),
                 help="Path to the location in the local file system that will be processed.",
             ),
-            click.Option(
-                ["--file-glob"],
-                default=None,
-                type=DelimitedString(),
-                help="A comma-separated list of file globs to limit which types of "
-                "local files are accepted, e.g. '*.html,*.txt'",
-            ),
             click.Option(
                 ["--recursive"],
                 is_flag=True,
unstructured_ingest/v2/cli/cmds/milvus.py
ADDED
@@ -0,0 +1,72 @@
+from dataclasses import dataclass
+
+import click
+
+from unstructured_ingest.v2.cli.base import DestCmd
+from unstructured_ingest.v2.cli.interfaces import CliConfig
+from unstructured_ingest.v2.processes.connectors.milvus import CONNECTOR_TYPE
+
+
+@dataclass
+class MilvusCliConnectionConfig(CliConfig):
+    @staticmethod
+    def get_cli_options() -> list[click.Option]:
+        options = [
+            click.Option(
+                ["--uri"],
+                required=False,
+                type=str,
+                default=None,
+                help="Milvus uri, eg 'http://localhost:19530",
+            ),
+            click.Option(
+                ["--user"],
+                required=False,
+                type=str,
+                default=None,
+                help="Milvus user",
+            ),
+            click.Option(
+                ["--password"],
+                required=False,
+                type=str,
+                default=None,
+                help="Milvus password",
+            ),
+            click.Option(
+                ["--db-name"],
+                required=False,
+                type=str,
+                default=None,
+                help="Milvus database name",
+            ),
+        ]
+        return options
+
+
+@dataclass
+class MilvusCliUploaderConfig(CliConfig):
+    @staticmethod
+    def get_cli_options() -> list[click.Option]:
+        options = [
+            click.Option(
+                ["--collection-name"],
+                required=True,
+                type=str,
+                help="Milvus collections to write to",
+            ),
+            click.Option(
+                ["--num-of-processes"],
+                type=click.IntRange(min=1),
+                default=4,
+                help="number of processes to use when writing to support parallel writes",
+            ),
+        ]
+        return options
+
+
+milvus_dest_cmd = DestCmd(
+    cmd_name=CONNECTOR_TYPE,
+    connection_config=MilvusCliConnectionConfig,
+    uploader_config=MilvusCliUploaderConfig,
+)
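As a quick preview of what these definitions expose, the options can be attached to a bare click command and the generated help rendered. Illustrative only; the released wheel assembles the real command through DestCmd:

    import click

    from unstructured_ingest.v2.cli.cmds.milvus import (
        MilvusCliConnectionConfig,
        MilvusCliUploaderConfig,
    )

    # Attach the new options to a throwaway command and print its help text.
    cmd = click.Command(
        "milvus",
        params=MilvusCliConnectionConfig.get_cli_options()
        + MilvusCliUploaderConfig.get_cli_options(),
    )
    with click.Context(cmd) as ctx:
        click.echo(cmd.get_help(ctx))  # --uri, --user, --password, --db-name,
                                       # --collection-name, --num-of-processes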
unstructured_ingest/v2/cli/configs/__init__.py
CHANGED
@@ -1,6 +1,13 @@
 from .chunk import ChunkerCliConfig
 from .embed import EmbedderCliConfig
+from .filter import FilterCliConfig
 from .partition import PartitionerCliConfig
 from .processor import ProcessorCliConfig

-__all__ = [
+__all__ = [
+    "ChunkerCliConfig",
+    "ProcessorCliConfig",
+    "PartitionerCliConfig",
+    "EmbedderCliConfig",
+    "FilterCliConfig",
+]
unstructured_ingest/v2/cli/configs/filter.py
ADDED
@@ -0,0 +1,28 @@
+from dataclasses import dataclass
+
+import click
+
+from unstructured_ingest.v2.cli.interfaces import CliConfig
+from unstructured_ingest.v2.cli.utils import DelimitedString
+
+
+@dataclass
+class FilterCliConfig(CliConfig):
+    @staticmethod
+    def get_cli_options() -> list[click.Option]:
+        options = [
+            click.Option(
+                ["--file-glob"],
+                default=None,
+                type=DelimitedString(),
+                help="A comma-separated list of file globs to limit which types of "
+                "local files are accepted, e.g. '*.html,*.txt'",
+            ),
+            click.Option(
+                ["--max-file-size"],
+                default=None,
+                type=click.IntRange(min=1),
+                help="Max file size to process in bytes",
+            ),
+        ]
+        return options
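These flags replace the per-connector --file-glob options removed from the fsspec and local commands above, and feed the new Filterer process in unstructured_ingest/v2/processes/filter.py (+54 lines, not shown in this diff). A minimal sketch of a glob-and-size filter along those lines; the class and field names here are assumptions, not the released code:

    import fnmatch
    from dataclasses import dataclass
    from pathlib import Path
    from typing import Optional

    @dataclass
    class FiltererConfigSketch:  # hypothetical stand-in for FiltererConfig
        file_glob: Optional[list[str]] = None  # e.g. ["*.html", "*.txt"]
        max_file_size: Optional[int] = None    # bytes, mirrors --max-file-size

        def passes(self, path: str, size_bytes: Optional[int]) -> bool:
            if self.file_glob is not None:
                name = Path(path).name
                if not any(fnmatch.fnmatch(name, g) for g in self.file_glob):
                    return False
            if self.max_file_size and size_bytes and size_bytes > self.max_file_size:
                return False
            return True

    cfg = FiltererConfigSketch(file_glob=["*.html", "*.txt"], max_file_size=1_000_000)
    print(cfg.passes("docs/index.html", 512))      # True
    print(cfg.passes("docs/raw.bin", 512))         # False: no glob match
    print(cfg.passes("docs/big.html", 2_000_000))  # False: over the size cap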
unstructured_ingest/v2/interfaces/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 from .connector import AccessConfig, BaseConnector, ConnectionConfig
 from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
-from .file_data import FileData, SourceIdentifiers
+from .file_data import FileData, FileDataSourceMetadata, SourceIdentifiers
 from .indexer import Indexer, IndexerConfig
 from .process import BaseProcess
 from .processor import ProcessorConfig
@@ -26,4 +26,5 @@ __all__ = [
     "AccessConfig",
     "ConnectionConfig",
     "BaseConnector",
+    "FileDataSourceMetadata",
 ]
unstructured_ingest/v2/interfaces/downloader.py
CHANGED
@@ -30,6 +30,15 @@ class Downloader(BaseProcess, BaseConnector, ABC):
     connector_type: str
     download_config: DownloaderConfigT

+    def get_download_path(self, file_data: FileData) -> Optional[Path]:
+        if not file_data.source_identifiers:
+            return None
+        rel_path = file_data.source_identifiers.relative_path
+        if not rel_path:
+            return None
+        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
+        return self.download_dir / Path(rel_path)
+
     @staticmethod
     def is_float(value: str):
         try:
@@ -68,9 +77,6 @@ class Downloader(BaseProcess, BaseConnector, ABC):
     def is_async(self) -> bool:
         return True

-    def get_download_path(self, file_data: FileData) -> Optional[Path]:
-        return None
-
     @abstractmethod
     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
         pass
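The method promoted above replaces a stub that always returned None, giving every downloader a deterministic location under download_dir when source identifiers carry a relative path. The path arithmetic, mirrored on plain values (illustrative, not the library class):

    from pathlib import Path

    # Mirrors the new default get_download_path: strip a leading slash from the
    # relative path, then resolve it under the configured download directory.
    download_dir = Path("/tmp/ingest-downloads")
    rel_path = "/docs/report.pdf"
    rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
    print(download_dir / Path(rel_path))  # /tmp/ingest-downloads/docs/report.pdf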
unstructured_ingest/v2/interfaces/file_data.py
CHANGED
@@ -22,13 +22,18 @@ class SourceIdentifiers:
         return self.rel_path or self.fullpath


+@dataclass
+class FileDataSourceMetadata(DataSourceMetadata):
+    filesize_bytes: Optional[int] = None
+
+
 @dataclass
 class FileData(DataClassJsonMixin):
     identifier: str
     connector_type: str
     source_identifiers: Optional[SourceIdentifiers] = None
     doc_type: Literal["file", "batch"] = field(default="file")
-    metadata:
+    metadata: FileDataSourceMetadata = field(default_factory=lambda: FileDataSourceMetadata())
     additional_metadata: dict[str, Any] = field(default_factory=dict)
     reprocess: bool = False
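Because FileDataSourceMetadata only adds an optional field on top of DataSourceMetadata, file-data records serialized before this release still deserialize with filesize_bytes left as None. A stand-in demonstration with local dataclasses (not the library's own types):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class DataSourceMetadata:  # stand-in for the existing base class
        url: Optional[str] = None

    @dataclass
    class FileDataSourceMetadata(DataSourceMetadata):  # shape matches the diff
        filesize_bytes: Optional[int] = None

    old_record = {"url": "s3://bucket/key"}  # serialized before 0.0.2
    meta = FileDataSourceMetadata(**old_record)
    print(meta.filesize_bytes)  # None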
unstructured_ingest/v2/logger.py
CHANGED
@@ -5,7 +5,7 @@ from logging import Formatter, Logger, StreamHandler, getLevelName, getLogger
 from typing import Any, Callable

 log_level = os.getenv("INGEST_LOG_LEVEL", "INFO")
-LOGGER_NAME = "unstructured.ingest.v2"
+LOGGER_NAME = "unstructured_ingest.v2"


 def default_is_data_sensitive(k: str, v: Any) -> bool:
unstructured_ingest/v2/pipeline/interfaces.py
CHANGED
@@ -92,7 +92,7 @@ class PipelineStep(ABC):

         if iterable:
             if len(iterable) == 1:
-                return
+                return self.process_serially(iterable)
             if self.context.num_processes == 1:
                 return self.process_serially(iterable)
             with mp.Pool(
@@ -126,6 +126,8 @@
             logger.info(
                 f"Calling {self.__class__.__name__} " f"with {len(iterable)} docs",  # type: ignore
             )
+        else:
+            logger.info(f"Calling {self.__class__.__name__} with no inputs")
         if self.context.async_supported and self.process.is_async():
            return self.process_async(iterable=iterable)
         if self.context.mp_supported:
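The one-line change in the first hunk closes an edge case: with exactly one input, the old branch returned None instead of processing the record. A stripped-down reproduction of the control flow (stand-in functions, not the library code):

    # With exactly one record, the old branch returned None and dropped it.
    def old_apply(iterable):
        if iterable:
            if len(iterable) == 1:
                return  # bug: the single record is never processed
        return [f"processed {i}" for i in iterable]

    def new_apply(iterable):
        if iterable:
            if len(iterable) == 1:
                return [f"processed {i}" for i in iterable]  # process serially
        return [f"processed {i}" for i in iterable]

    print(old_apply(["doc"]))  # None
    print(new_apply(["doc"]))  # ['processed doc']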
unstructured_ingest/v2/pipeline/pipeline.py
CHANGED
@@ -9,6 +9,7 @@ from unstructured_ingest.v2.logger import logger, make_default_logger
 from unstructured_ingest.v2.pipeline.steps.chunk import Chunker, ChunkStep
 from unstructured_ingest.v2.pipeline.steps.download import DownloaderT, DownloadStep
 from unstructured_ingest.v2.pipeline.steps.embed import Embedder, EmbedStep
+from unstructured_ingest.v2.pipeline.steps.filter import Filterer, FilterStep
 from unstructured_ingest.v2.pipeline.steps.index import IndexerT, IndexStep
 from unstructured_ingest.v2.pipeline.steps.partition import Partitioner, PartitionStep
 from unstructured_ingest.v2.pipeline.steps.stage import UploadStager, UploadStageStep
@@ -27,6 +28,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 from unstructured_ingest.v2.processes.connectors.local import LocalUploader
 from unstructured_ingest.v2.processes.embedder import EmbedderConfig
+from unstructured_ingest.v2.processes.filter import FiltererConfig
 from unstructured_ingest.v2.processes.partitioner import PartitionerConfig


@@ -37,22 +39,33 @@ class PipelineError(Exception):
 @dataclass
 class Pipeline:
     context: ProcessorConfig
+
     indexer: InitVar[IndexerT]
     indexer_step: IndexStep = field(init=False)
+
     downloader: InitVar[DownloaderT]
     downloader_step: DownloadStep = field(init=False)
+
     partitioner: InitVar[Partitioner]
     partitioner_step: PartitionStep = field(init=False)
+
     chunker: InitVar[Optional[Chunker]] = None
     chunker_step: ChunkStep = field(init=False, default=None)
+
     embedder: InitVar[Optional[Embedder]] = None
     embedder_step: EmbedStep = field(init=False, default=None)
+
     stager: InitVar[Optional[UploadStager]] = None
     stager_step: UploadStageStep = field(init=False, default=None)
+
     uploader: InitVar[Uploader] = field(default=LocalUploader())
     uploader_step: UploadStep = field(init=False, default=None)
+
     uncompress_step: UncompressStep = field(init=False, default=None)

+    filterer: InitVar[Optional[Filterer]] = None
+    filter_step: FilterStep = field(init=False, default=None)
+
     def __post_init__(
         self,
         indexer: IndexerT,
@@ -62,10 +75,12 @@
         embedder: Embedder = None,
         stager: UploadStager = None,
         uploader: Uploader = None,
+        filterer: Filterer = None,
     ):
         make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
         self.indexer_step = IndexStep(process=indexer, context=self.context)
         self.downloader_step = DownloadStep(process=downloader, context=self.context)
+        self.filter_step = FilterStep(process=filterer, context=self.context) if filterer else None
         self.partitioner_step = PartitionStep(process=partitioner, context=self.context)
         self.chunker_step = ChunkStep(process=chunker, context=self.context) if chunker else None

@@ -109,6 +124,7 @@
     def run(self):
         try:
             start_time = time()
+            self._run_prechecks()
             self._run()
             logger.info(f"Finished ingest process in {time() - start_time}s")
         finally:
@@ -130,6 +146,37 @@
         final = [f for f in flat if f]
         return final or None

+    def _run_prechecks(self):
+        steps = [self.indexer_step, self.downloader_step, self.partitioner_step, self.uploader_step]
+        if self.chunker_step:
+            steps.append(self.chunker_step)
+        if self.embedder_step:
+            steps.append(self.embedder_step)
+        if self.uncompress_step:
+            steps.append(self.uncompress_step)
+        if self.stager_step:
+            steps.append(self.stager_step)
+        failures = {}
+        for step in steps:
+            try:
+                step.process.precheck()
+            except Exception as e:
+                failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
+        if failures:
+            for k, v in failures.items():
+                logger.error(f"Step precheck failure: {k}: {v}")
+            raise PipelineError("Precheck failed")
+
+    def apply_filter(self, records: list[dict]) -> list[dict]:
+        if not self.filter_step:
+            return records
+        data_to_filter = [{"file_data_path": i["file_data_path"]} for i in records]
+        filtered_data = self.filter_step(data_to_filter)
+        filtered_data = [f for f in filtered_data if f is not None]
+        filtered_file_data_paths = [r["file_data_path"] for r in filtered_data]
+        filtered_records = [r for r in records if r["file_data_path"] in filtered_file_data_paths]
+        return filtered_records
+
     def _run(self):
         logger.info(
             f"Running local pipline: {self} with configs: "
@@ -147,18 +194,33 @@
         if not indices_inputs:
             return

+        # Initial filtering on indexed content
+        indices_inputs = self.apply_filter(records=indices_inputs)
+        if not indices_inputs:
+            return
+
         # Download associated content to local file system
         downloaded_data = self.downloader_step(indices_inputs)
         downloaded_data = self.clean_results(results=downloaded_data)
         if not downloaded_data:
             return

+        # Post download filtering
+        downloaded_data = self.apply_filter(records=downloaded_data)
+        if not downloaded_data:
+            return
+
         # Run uncompress if available
         if self.uncompress_step:
             downloaded_data = self.uncompress_step(downloaded_data)
             # Flatten list of lists
             downloaded_data = self.clean_results(results=downloaded_data)

+        # Post uncompress filtering
+        downloaded_data = self.apply_filter(records=downloaded_data)
+        if not downloaded_data:
+            return
+
         if not downloaded_data:
             return

@@ -179,9 +241,14 @@
         self.uploader_step(iterable=elements)

     def __str__(self):
-        s = [str(self.indexer_step)
+        s = [str(self.indexer_step)]
+        if filter_step := self.filter_step:
+            s.append(str(filter_step))
+        s.append(str(self.downloader_step))
+        if filter_step := self.filter_step:
+            s.append(str(filter_step))
         if uncompress_step := self.uncompress_step:
-            s.
+            s.extend([str(uncompress_step), str(filter_step)])
         s.append(str(self.partitioner_step))
         if chunker_step := self.chunker_step:
             s.append(str(chunker_step))
@@ -200,6 +267,7 @@
         downloader_config: DownloaderConfigT,
         source_connection_config: ConnectionConfig,
         partitioner_config: PartitionerConfig,
+        filterer_config: FiltererConfig = None,
         chunker_config: Optional[ChunkerConfig] = None,
         embedder_config: Optional[EmbedderConfig] = None,
         destination_connection_config: Optional[ConnectionConfig] = None,
@@ -235,6 +303,8 @@
             ),
             "partitioner": Partitioner(config=partitioner_config),
         }
+        if filterer_config:
+            pipeline_kwargs["filterer"] = Filterer(config=filterer_config)
         if chunker_config:
             pipeline_kwargs["chunker"] = Chunker(config=chunker_config)
         if embedder_config:
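Taken together, the filter stage can now be enabled programmatically as well as from the CLI. A sketch of the wiring, assuming FiltererConfig's fields mirror the --file-glob and --max-file-size flags (processes/filter.py is not shown in this diff):

    from unstructured_ingest.v2.processes.filter import Filterer, FiltererConfig

    # Field names below are assumed from the CLI flags introduced in this release.
    filterer = Filterer(
        config=FiltererConfig(file_glob=["*.pdf", "*.docx"], max_file_size=10_000_000)
    )
    # Either hand the process to the constructor:
    #     Pipeline(..., filterer=filterer)
    # or pass the config through the new from_configs keyword:
    #     Pipeline.from_configs(..., filterer_config=filterer.config)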
unstructured_ingest/v2/pipeline/steps/download.py
CHANGED
@@ -2,6 +2,7 @@ import asyncio
 import hashlib
 import json
 from dataclasses import dataclass
+from pathlib import Path
 from typing import Callable, Optional, TypedDict, TypeVar

 from unstructured_ingest.v2.interfaces import FileData, download_responses
@@ -70,11 +71,40 @@ class DownloadStep(PipelineStep):
             return True
         return False

+    def update_file_data(
+        self, file_data: FileData, file_data_path: Path, download_path: Path
+    ) -> None:
+        file_size_bytes = download_path.stat().st_size
+        changed = False
+        if not file_data.metadata.filesize_bytes and file_size_bytes:
+            changed = True
+            file_data.metadata.filesize_bytes = file_size_bytes
+        if (
+            file_data.metadata.filesize_bytes
+            and file_data.metadata.filesize_bytes != file_size_bytes
+        ):
+            logger.warning(
+                f"file size in original file data "
+                f"({file_data.metadata.filesize_bytes}) doesn't "
+                f"match size of local file: {file_size_bytes}, updating"
+            )
+            changed = True
+            file_data.metadata.filesize_bytes = file_size_bytes
+        if changed:
+            logger.debug(f"Updating file data with new content: {file_data.to_dict()}")
+            with file_data_path.open("w") as file:
+                json.dump(file_data.to_dict(), file, indent=2)
+
     async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
         file_data = FileData.from_file(path=file_data_path)
         download_path = self.process.get_download_path(file_data=file_data)
         if not self.should_download(file_data=file_data, file_data_path=file_data_path):
             logger.debug(f"Skipping download, file already exists locally: {download_path}")
+            self.update_file_data(
+                file_data=file_data,
+                file_data_path=Path(file_data_path),
+                download_path=download_path,
+            )
             return [DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))]
         fn_kwargs = {"file_data": file_data}
         if not asyncio.iscoroutinefunction(fn):
@@ -85,26 +115,60 @@ class DownloadStep(PipelineStep):
         else:
             download_results = await fn(**fn_kwargs)
         return self.create_step_results(
-            current_file_data_path=file_data_path,
+            current_file_data_path=file_data_path,
+            download_results=download_results,
+            current_file_data=file_data,
         )

     def create_step_results(
-        self,
+        self,
+        current_file_data_path: str,
+        current_file_data: FileData,
+        download_results: download_responses,
     ) -> list[DownloadStepResponse]:
+        responses = []
         if not isinstance(download_results, list):
-
-
-
+            file_data = current_file_data
+            file_data_path = current_file_data_path
+            download_path = download_results["path"]
+            if download_results["file_data"].identifier == current_file_data.identifier:
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
+                )
+                responses = [
+                    DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))
+                ]
+            else:
+                file_data = download_results["file_data"]
+                file_data_path = self.persist_new_file_data(file_data=file_data)
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
                 )
-
+                responses = [
+                    DownloadStepResponse(
+                        file_data_path=current_file_data_path, path=str(download_results["path"])
+                    )
+                ]
+        else:
             # Supplemental results generated as part of the download process
-
-
-
-
-
-
-
+            for res in download_results:
+                file_data = res["file_data"]
+                file_data_path = self.persist_new_file_data(file_data=file_data)
+                download_path = res["path"]
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
+                )
+                responses.append(
+                    DownloadStepResponse(file_data_path=file_data_path, path=res["path"])
+                )
+
+        return responses

     def persist_new_file_data(self, file_data: FileData) -> str:
         record_hash = self.get_hash(extras=[file_data.identifier])
|