unstructured-ingest 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/v2/cli/base/cmd.py +10 -0
- unstructured_ingest/v2/cli/base/src.py +2 -0
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +1 -9
- unstructured_ingest/v2/cli/cmds/local.py +0 -8
- unstructured_ingest/v2/cli/configs/__init__.py +8 -1
- unstructured_ingest/v2/cli/configs/filter.py +28 -0
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/downloader.py +9 -3
- unstructured_ingest/v2/interfaces/file_data.py +6 -1
- unstructured_ingest/v2/interfaces/process.py +3 -0
- unstructured_ingest/v2/pipeline/interfaces.py +3 -5
- unstructured_ingest/v2/pipeline/pipeline.py +72 -2
- unstructured_ingest/v2/pipeline/steps/download.py +77 -13
- unstructured_ingest/v2/pipeline/steps/filter.py +40 -0
- unstructured_ingest/v2/processes/connectors/astra.py +8 -0
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +8 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +8 -6
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +9 -0
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +23 -9
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +22 -31
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +13 -5
- unstructured_ingest/v2/processes/connectors/google_drive.py +13 -9
- unstructured_ingest/v2/processes/connectors/local.py +15 -15
- unstructured_ingest/v2/processes/connectors/mongodb.py +10 -4
- unstructured_ingest/v2/processes/connectors/onedrive.py +14 -2
- unstructured_ingest/v2/processes/connectors/pinecone.py +6 -3
- unstructured_ingest/v2/processes/connectors/salesforce.py +10 -8
- unstructured_ingest/v2/processes/connectors/sharepoint.py +14 -8
- unstructured_ingest/v2/processes/connectors/sql.py +24 -9
- unstructured_ingest/v2/processes/connectors/weaviate.py +13 -5
- unstructured_ingest/v2/processes/filter.py +54 -0
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/METADATA +13 -13
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/RECORD +37 -34
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.1.dist-info → unstructured_ingest-0.0.2.dist-info}/top_level.txt +0 -0

unstructured_ingest/__version__.py
@@ -1 +1 @@
-__version__ = "0.0.1"  # pragma: no cover
+__version__ = "0.0.2"  # pragma: no cover

unstructured_ingest/v2/cli/base/cmd.py
@@ -24,6 +24,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 from unstructured_ingest.v2.processes.connectors.local import LocalUploader, LocalUploaderConfig
 from unstructured_ingest.v2.processes.embedder import Embedder, EmbedderConfig
+from unstructured_ingest.v2.processes.filter import Filterer, FiltererConfig
 from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig
 
 CommandT = TypeVar("CommandT", bound=click.Command)
@@ -75,6 +76,8 @@ class BaseCmd(ABC):
         }
         if chunker := self.get_chunker(options=source_options):
             pipeline_kwargs["chunker"] = chunker
+        if filterer := self.get_filterer(options=source_options):
+            pipeline_kwargs["filterer"] = filterer
         if embedder := self.get_embeder(options=source_options):
             pipeline_kwargs["embedder"] = embedder
         if dest:
@@ -105,6 +108,13 @@ class BaseCmd(ABC):
             return None
         return Chunker(config=chunker_config)
 
+    @staticmethod
+    def get_filterer(options: dict[str, Any]) -> Optional[Filterer]:
+        filterer_configs = extract_config(flat_data=options, config=FiltererConfig)
+        if not filterer_configs.to_dict():
+            return None
+        return Filterer(config=filterer_configs)
+
     @staticmethod
     def get_embeder(options: dict[str, Any]) -> Optional[Embedder]:
         embedder_config = extract_config(flat_data=options, config=EmbedderConfig)

unstructured_ingest/v2/cli/base/src.py
@@ -8,6 +8,7 @@ from unstructured_ingest.v2.cli.base.cmd import BaseCmd
 from unstructured_ingest.v2.cli.configs import (
     ChunkerCliConfig,
     EmbedderCliConfig,
+    FilterCliConfig,
     PartitionerCliConfig,
     ProcessorCliConfig,
 )
@@ -26,6 +27,7 @@ class SrcCmd(BaseCmd):
             ProcessorCliConfig,
             PartitionerCliConfig,
             EmbedderCliConfig,
+            FilterCliConfig,
             ChunkerCliConfig,
         ]
     )

unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py
@@ -3,7 +3,6 @@ from dataclasses import dataclass
 import click
 
 from unstructured_ingest.v2.cli.interfaces import CliConfig
-from unstructured_ingest.v2.cli.utils import DelimitedString
 
 
 @dataclass
@@ -14,7 +13,7 @@ class FsspecCliDownloadConfig(CliConfig):
             click.Option(
                 ["--download-dir"],
                 help="Where files are downloaded to, defaults to a location at"
-                "`$HOME/.cache/…
+                "`$HOME/.cache/unstructured/ingest/<connector name>/<SHA256>`.",
             ),
         ]
 
@@ -65,13 +64,6 @@ class FsspecCliIndexerConfig(FsspecCliFileConfig):
                     help="Recursively download files in their respective folders "
                     "otherwise stop at the files in provided folder level.",
                 ),
-                click.Option(
-                    ["--file-glob"],
-                    default=None,
-                    type=DelimitedString(),
-                    help="A comma-separated list of file globs to limit which types of "
-                    "local files are accepted, e.g. '*.html,*.txt'",
-                ),
             ]
         )
         return options

unstructured_ingest/v2/cli/cmds/local.py
@@ -4,7 +4,6 @@ import click
 
 from unstructured_ingest.v2.cli.base import DestCmd, SrcCmd
 from unstructured_ingest.v2.cli.interfaces import CliConfig
-from unstructured_ingest.v2.cli.utils import DelimitedString
 from unstructured_ingest.v2.processes.connectors.local import CONNECTOR_TYPE
 
 
@@ -19,13 +18,6 @@ class LocalCliIndexerConfig(CliConfig):
                 type=click.Path(file_okay=True, dir_okay=True, exists=True),
                 help="Path to the location in the local file system that will be processed.",
             ),
-            click.Option(
-                ["--file-glob"],
-                default=None,
-                type=DelimitedString(),
-                help="A comma-separated list of file globs to limit which types of "
-                "local files are accepted, e.g. '*.html,*.txt'",
-            ),
             click.Option(
                 ["--recursive"],
                 is_flag=True,

unstructured_ingest/v2/cli/configs/__init__.py
@@ -1,6 +1,13 @@
 from .chunk import ChunkerCliConfig
 from .embed import EmbedderCliConfig
+from .filter import FilterCliConfig
 from .partition import PartitionerCliConfig
 from .processor import ProcessorCliConfig
 
-__all__ = […
+__all__ = [
+    "ChunkerCliConfig",
+    "ProcessorCliConfig",
+    "PartitionerCliConfig",
+    "EmbedderCliConfig",
+    "FilterCliConfig",
+]

unstructured_ingest/v2/cli/configs/filter.py (new file)
@@ -0,0 +1,28 @@
+from dataclasses import dataclass
+
+import click
+
+from unstructured_ingest.v2.cli.interfaces import CliConfig
+from unstructured_ingest.v2.cli.utils import DelimitedString
+
+
+@dataclass
+class FilterCliConfig(CliConfig):
+    @staticmethod
+    def get_cli_options() -> list[click.Option]:
+        options = [
+            click.Option(
+                ["--file-glob"],
+                default=None,
+                type=DelimitedString(),
+                help="A comma-separated list of file globs to limit which types of "
+                "local files are accepted, e.g. '*.html,*.txt'",
+            ),
+            click.Option(
+                ["--max-file-size"],
+                default=None,
+                type=click.IntRange(min=1),
+                help="Max file size to process in bytes",
+            ),
+        ]
+        return options
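
These flags are collected into a `FiltererConfig` by `BaseCmd.get_filterer` via `extract_config` (see the cmd.py hunk above). A minimal sketch of building the filter process directly, assuming the config field names mirror the CLI flags (`file_glob`, `max_file_size`); `unstructured_ingest/v2/processes/filter.py` itself is not rendered in this diff, so treat the field names as assumptions:

```python
from unstructured_ingest.v2.processes.filter import Filterer, FiltererConfig

# Field names assumed to mirror the --file-glob and --max-file-size options above.
config = FiltererConfig(
    file_glob=["*.html", "*.txt"],  # keep only files matching one of these globs
    max_file_size=10_000_000,       # skip anything larger than ~10 MB
)
filterer = Filterer(config=config)
```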

unstructured_ingest/v2/interfaces/__init__.py
@@ -1,6 +1,6 @@
 from .connector import AccessConfig, BaseConnector, ConnectionConfig
 from .downloader import Downloader, DownloaderConfig, DownloadResponse, download_responses
-from .file_data import FileData, SourceIdentifiers
+from .file_data import FileData, FileDataSourceMetadata, SourceIdentifiers
 from .indexer import Indexer, IndexerConfig
 from .process import BaseProcess
 from .processor import ProcessorConfig
@@ -26,4 +26,5 @@ __all__ = [
     "AccessConfig",
     "ConnectionConfig",
     "BaseConnector",
+    "FileDataSourceMetadata",
 ]

unstructured_ingest/v2/interfaces/downloader.py
@@ -30,6 +30,15 @@ class Downloader(BaseProcess, BaseConnector, ABC):
     connector_type: str
     download_config: DownloaderConfigT
 
+    def get_download_path(self, file_data: FileData) -> Optional[Path]:
+        if not file_data.source_identifiers:
+            return None
+        rel_path = file_data.source_identifiers.relative_path
+        if not rel_path:
+            return None
+        rel_path = rel_path[1:] if rel_path.startswith("/") else rel_path
+        return self.download_dir / Path(rel_path)
+
     @staticmethod
     def is_float(value: str):
         try:
@@ -68,9 +77,6 @@ class Downloader(BaseProcess, BaseConnector, ABC):
     def is_async(self) -> bool:
         return True
 
-    def get_download_path(self, file_data: FileData) -> Optional[Path]:
-        return None
-
     @abstractmethod
     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
         pass
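
A standalone sketch of the path mapping that this new default `get_download_path` performs; the directory and relative path values here are illustrative only:

```python
from pathlib import Path

def resolve_download_path(download_dir: Path, relative_path: str) -> Path:
    # Mirrors the logic above: strip a leading "/" from the source-relative
    # path and nest it under the connector's download directory.
    rel_path = relative_path[1:] if relative_path.startswith("/") else relative_path
    return download_dir / Path(rel_path)

# Illustrative values:
print(resolve_download_path(Path("/tmp/ingest-cache/s3"), "/docs/report.pdf"))
# -> /tmp/ingest-cache/s3/docs/report.pdf
```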

unstructured_ingest/v2/interfaces/file_data.py
@@ -22,13 +22,18 @@ class SourceIdentifiers:
         return self.rel_path or self.fullpath
 
 
+@dataclass
+class FileDataSourceMetadata(DataSourceMetadata):
+    filesize_bytes: Optional[int] = None
+
+
 @dataclass
 class FileData(DataClassJsonMixin):
     identifier: str
     connector_type: str
     source_identifiers: Optional[SourceIdentifiers] = None
     doc_type: Literal["file", "batch"] = field(default="file")
-    metadata: …
+    metadata: FileDataSourceMetadata = field(default_factory=lambda: FileDataSourceMetadata())
     additional_metadata: dict[str, Any] = field(default_factory=dict)
     reprocess: bool = False
 
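
A small sketch of constructing the updated `FileData` record with the new `FileDataSourceMetadata`; the identifier and connector name are illustrative, and the fields inherited from `DataSourceMetadata` are assumed to be optional and left at their defaults:

```python
from unstructured_ingest.v2.interfaces import FileData, FileDataSourceMetadata

file_data = FileData(
    identifier="example-0001",  # illustrative id, normally assigned by an indexer
    connector_type="local",     # illustrative connector name
    metadata=FileDataSourceMetadata(filesize_bytes=2048),
)
# DataClassJsonMixin serialization, used by the pipeline steps to persist records to disk.
print(file_data.to_dict())
```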

unstructured_ingest/v2/pipeline/interfaces.py
@@ -92,7 +92,7 @@ class PipelineStep(ABC):
 
         if iterable:
             if len(iterable) == 1:
-                return …
+                return self.process_serially(iterable)
         if self.context.num_processes == 1:
             return self.process_serially(iterable)
         with mp.Pool(
@@ -126,6 +126,8 @@ class PipelineStep(ABC):
             logger.info(
                 f"Calling {self.__class__.__name__} " f"with {len(iterable)} docs",  # type: ignore
             )
+        else:
+            logger.info(f"Calling {self.__class__.__name__} with no inputs")
         if self.context.async_supported and self.process.is_async():
             return self.process_async(iterable=iterable)
         if self.context.mp_supported:
@@ -146,8 +148,6 @@
             logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
             if "file_data_path" in kwargs:
                 self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
-            else:
-                self.context.status[self.identifier] = {"step_error": str(e)}
             if self.context.raise_on_error:
                 raise e
             return None
@@ -160,8 +160,6 @@
             logger.error(f"Exception raised while running {self.identifier}", exc_info=e)
             if "file_data_path" in kwargs:
                 self.context.status[kwargs["file_data_path"]] = {self.identifier: str(e)}
-            else:
-                self.context.status[self.identifier] = {"step_error": str(e)}
             if self.context.raise_on_error:
                 raise e
             return None

unstructured_ingest/v2/pipeline/pipeline.py
@@ -9,6 +9,7 @@ from unstructured_ingest.v2.logger import logger, make_default_logger
 from unstructured_ingest.v2.pipeline.steps.chunk import Chunker, ChunkStep
 from unstructured_ingest.v2.pipeline.steps.download import DownloaderT, DownloadStep
 from unstructured_ingest.v2.pipeline.steps.embed import Embedder, EmbedStep
+from unstructured_ingest.v2.pipeline.steps.filter import Filterer, FilterStep
 from unstructured_ingest.v2.pipeline.steps.index import IndexerT, IndexStep
 from unstructured_ingest.v2.pipeline.steps.partition import Partitioner, PartitionStep
 from unstructured_ingest.v2.pipeline.steps.stage import UploadStager, UploadStageStep
@@ -27,6 +28,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 from unstructured_ingest.v2.processes.connectors.local import LocalUploader
 from unstructured_ingest.v2.processes.embedder import EmbedderConfig
+from unstructured_ingest.v2.processes.filter import FiltererConfig
 from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
 
 
@@ -37,22 +39,33 @@ class PipelineError(Exception):
 @dataclass
 class Pipeline:
     context: ProcessorConfig
+
     indexer: InitVar[IndexerT]
     indexer_step: IndexStep = field(init=False)
+
     downloader: InitVar[DownloaderT]
     downloader_step: DownloadStep = field(init=False)
+
     partitioner: InitVar[Partitioner]
     partitioner_step: PartitionStep = field(init=False)
+
     chunker: InitVar[Optional[Chunker]] = None
     chunker_step: ChunkStep = field(init=False, default=None)
+
     embedder: InitVar[Optional[Embedder]] = None
     embedder_step: EmbedStep = field(init=False, default=None)
+
     stager: InitVar[Optional[UploadStager]] = None
     stager_step: UploadStageStep = field(init=False, default=None)
+
     uploader: InitVar[Uploader] = field(default=LocalUploader())
     uploader_step: UploadStep = field(init=False, default=None)
+
     uncompress_step: UncompressStep = field(init=False, default=None)
 
+    filterer: InitVar[Optional[Filterer]] = None
+    filter_step: FilterStep = field(init=False, default=None)
+
     def __post_init__(
         self,
         indexer: IndexerT,
@@ -62,10 +75,12 @@
         embedder: Embedder = None,
         stager: UploadStager = None,
         uploader: Uploader = None,
+        filterer: Filterer = None,
     ):
         make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
         self.indexer_step = IndexStep(process=indexer, context=self.context)
         self.downloader_step = DownloadStep(process=downloader, context=self.context)
+        self.filter_step = FilterStep(process=filterer, context=self.context) if filterer else None
         self.partitioner_step = PartitionStep(process=partitioner, context=self.context)
         self.chunker_step = ChunkStep(process=chunker, context=self.context) if chunker else None
@@ -109,6 +124,7 @@
     def run(self):
         try:
             start_time = time()
+            self._run_prechecks()
             self._run()
             logger.info(f"Finished ingest process in {time() - start_time}s")
         finally:
@@ -130,6 +146,37 @@
         final = [f for f in flat if f]
         return final or None
 
+    def _run_prechecks(self):
+        steps = [self.indexer_step, self.downloader_step, self.partitioner_step, self.uploader_step]
+        if self.chunker_step:
+            steps.append(self.chunker_step)
+        if self.embedder_step:
+            steps.append(self.embedder_step)
+        if self.uncompress_step:
+            steps.append(self.uncompress_step)
+        if self.stager_step:
+            steps.append(self.stager_step)
+        failures = {}
+        for step in steps:
+            try:
+                step.process.precheck()
+            except Exception as e:
+                failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
+        if failures:
+            for k, v in failures.items():
+                logger.error(f"Step precheck failure: {k}: {v}")
+            raise PipelineError("Precheck failed")
+
+    def apply_filter(self, records: list[dict]) -> list[dict]:
+        if not self.filter_step:
+            return records
+        data_to_filter = [{"file_data_path": i["file_data_path"]} for i in records]
+        filtered_data = self.filter_step(data_to_filter)
+        filtered_data = [f for f in filtered_data if f is not None]
+        filtered_file_data_paths = [r["file_data_path"] for r in filtered_data]
+        filtered_records = [r for r in records if r["file_data_path"] in filtered_file_data_paths]
+        return filtered_records
+
     def _run(self):
         logger.info(
             f"Running local pipline: {self} with configs: "
@@ -147,18 +194,33 @@
         if not indices_inputs:
             return
 
+        # Initial filtering on indexed content
+        indices_inputs = self.apply_filter(records=indices_inputs)
+        if not indices_inputs:
+            return
+
         # Download associated content to local file system
         downloaded_data = self.downloader_step(indices_inputs)
         downloaded_data = self.clean_results(results=downloaded_data)
         if not downloaded_data:
             return
 
+        # Post download filtering
+        downloaded_data = self.apply_filter(records=downloaded_data)
+        if not downloaded_data:
+            return
+
         # Run uncompress if available
         if self.uncompress_step:
             downloaded_data = self.uncompress_step(downloaded_data)
         # Flatten list of lists
         downloaded_data = self.clean_results(results=downloaded_data)
 
+        # Post uncompress filtering
+        downloaded_data = self.apply_filter(records=downloaded_data)
+        if not downloaded_data:
+            return
+
         if not downloaded_data:
             return
 
@@ -179,9 +241,14 @@
         self.uploader_step(iterable=elements)
 
     def __str__(self):
-        s = [str(self.indexer_step)…
+        s = [str(self.indexer_step)]
+        if filter_step := self.filter_step:
+            s.append(str(filter_step))
+        s.append(str(self.downloader_step))
+        if filter_step := self.filter_step:
+            s.append(str(filter_step))
         if uncompress_step := self.uncompress_step:
-            s.…
+            s.extend([str(uncompress_step), str(filter_step)])
         s.append(str(self.partitioner_step))
         if chunker_step := self.chunker_step:
             s.append(str(chunker_step))
@@ -200,6 +267,7 @@
         downloader_config: DownloaderConfigT,
         source_connection_config: ConnectionConfig,
         partitioner_config: PartitionerConfig,
+        filterer_config: FiltererConfig = None,
         chunker_config: Optional[ChunkerConfig] = None,
         embedder_config: Optional[EmbedderConfig] = None,
         destination_connection_config: Optional[ConnectionConfig] = None,
@@ -235,6 +303,8 @@
             ),
             "partitioner": Partitioner(config=partitioner_config),
         }
+        if filterer_config:
+            pipeline_kwargs["filterer"] = Filterer(config=filterer_config)
         if chunker_config:
             pipeline_kwargs["chunker"] = Chunker(config=chunker_config)
         if embedder_config:
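
Taken together, these pipeline.py changes let a Filterer be wired in either through the config-driven constructor above (`filterer_config`) or directly as the `filterer` InitVar. A minimal sketch of the direct form, assuming the local connector exposes the usual indexer/downloader classes; only `LocalUploader` appears in this diff, so `LocalIndexer*`, `LocalDownloader*`, `LocalConnectionConfig`, and their field names are assumptions, as is the `file_glob` field on `FiltererConfig`:

```python
from unstructured_ingest.v2.interfaces import ProcessorConfig
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.processes.connectors.local import (  # names beyond LocalUploader* are assumed
    LocalConnectionConfig,
    LocalDownloader,
    LocalDownloaderConfig,
    LocalIndexer,
    LocalIndexerConfig,
    LocalUploader,
    LocalUploaderConfig,
)
from unstructured_ingest.v2.processes.filter import Filterer, FiltererConfig
from unstructured_ingest.v2.processes.partitioner import Partitioner, PartitionerConfig

pipeline = Pipeline(
    context=ProcessorConfig(),
    indexer=LocalIndexer(
        index_config=LocalIndexerConfig(input_path="example-docs"),  # assumed field name
        connection_config=LocalConnectionConfig(),
    ),
    downloader=LocalDownloader(
        download_config=LocalDownloaderConfig(),
        connection_config=LocalConnectionConfig(),
    ),
    partitioner=Partitioner(config=PartitionerConfig()),
    filterer=Filterer(config=FiltererConfig(file_glob=["*.txt"])),  # assumed field name
    uploader=LocalUploader(upload_config=LocalUploaderConfig(output_dir="output")),  # assumed field name
)
pipeline.run()  # runs prechecks, indexing, filtering, download, partition, upload
```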

unstructured_ingest/v2/pipeline/steps/download.py
@@ -2,6 +2,7 @@ import asyncio
 import hashlib
 import json
 from dataclasses import dataclass
+from pathlib import Path
 from typing import Callable, Optional, TypedDict, TypeVar
 
 from unstructured_ingest.v2.interfaces import FileData, download_responses
@@ -70,11 +71,40 @@ class DownloadStep(PipelineStep):
             return True
         return False
 
+    def update_file_data(
+        self, file_data: FileData, file_data_path: Path, download_path: Path
+    ) -> None:
+        file_size_bytes = download_path.stat().st_size
+        changed = False
+        if not file_data.metadata.filesize_bytes and file_size_bytes:
+            changed = True
+            file_data.metadata.filesize_bytes = file_size_bytes
+        if (
+            file_data.metadata.filesize_bytes
+            and file_data.metadata.filesize_bytes != file_size_bytes
+        ):
+            logger.warning(
+                f"file size in original file data "
+                f"({file_data.metadata.filesize_bytes}) doesn't "
+                f"match size of local file: {file_size_bytes}, updating"
+            )
+            changed = True
+            file_data.metadata.filesize_bytes = file_size_bytes
+        if changed:
+            logger.debug(f"Updating file data with new content: {file_data.to_dict()}")
+            with file_data_path.open("w") as file:
+                json.dump(file_data.to_dict(), file, indent=2)
+
     async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
         file_data = FileData.from_file(path=file_data_path)
         download_path = self.process.get_download_path(file_data=file_data)
         if not self.should_download(file_data=file_data, file_data_path=file_data_path):
             logger.debug(f"Skipping download, file already exists locally: {download_path}")
+            self.update_file_data(
+                file_data=file_data,
+                file_data_path=Path(file_data_path),
+                download_path=download_path,
+            )
             return [DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))]
         fn_kwargs = {"file_data": file_data}
         if not asyncio.iscoroutinefunction(fn):
@@ -85,26 +115,60 @@ class DownloadStep(PipelineStep):
         else:
             download_results = await fn(**fn_kwargs)
         return self.create_step_results(
-            current_file_data_path=file_data_path,
+            current_file_data_path=file_data_path,
+            download_results=download_results,
+            current_file_data=file_data,
         )
 
     def create_step_results(
-        self,
+        self,
+        current_file_data_path: str,
+        current_file_data: FileData,
+        download_results: download_responses,
     ) -> list[DownloadStepResponse]:
+        responses = []
         if not isinstance(download_results, list):
-        …
-        …
-        …
+            file_data = current_file_data
+            file_data_path = current_file_data_path
+            download_path = download_results["path"]
+            if download_results["file_data"].identifier == current_file_data.identifier:
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
+                )
+                responses = [
+                    DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))
+                ]
+            else:
+                file_data = download_results["file_data"]
+                file_data_path = self.persist_new_file_data(file_data=file_data)
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
                 )
-        …
+                responses = [
+                    DownloadStepResponse(
+                        file_data_path=current_file_data_path, path=str(download_results["path"])
+                    )
+                ]
+        else:
             # Supplemental results generated as part of the download process
-        …
-        …
-        …
-        …
-        …
-        …
-        …
+            for res in download_results:
+                file_data = res["file_data"]
+                file_data_path = self.persist_new_file_data(file_data=file_data)
+                download_path = res["path"]
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
+                )
+                responses.append(
+                    DownloadStepResponse(file_data_path=file_data_path, path=res["path"])
+                )
+
+        return responses
 
     def persist_new_file_data(self, file_data: FileData) -> str:
         record_hash = self.get_hash(extras=[file_data.identifier])

unstructured_ingest/v2/pipeline/steps/filter.py (new file)
@@ -0,0 +1,40 @@
+import asyncio
+from dataclasses import dataclass
+from typing import Callable, Optional
+
+from unstructured_ingest.v2.interfaces.file_data import FileData
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
+from unstructured_ingest.v2.pipeline.utils import sterilize_dict
+from unstructured_ingest.v2.processes.filter import Filterer
+
+STEP_ID = "filter"
+
+
+@dataclass
+class FilterStep(PipelineStep):
+    process: Filterer
+    identifier: str = STEP_ID
+
+    def __post_init__(self):
+        config = (
+            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
+            if self.process.config
+            else None
+        )
+        logger.info(f"Created {self.identifier} with configs: {config}")
+
+    async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
+        file_data = FileData.from_file(path=file_data_path)
+        fn_kwargs = {"file_data": file_data}
+        if not asyncio.iscoroutinefunction(fn):
+            resp = fn(**fn_kwargs)
+        elif semaphore := self.context.semaphore:
+            async with semaphore:
+                resp = await fn(**fn_kwargs)
+        else:
+            resp = await fn(**fn_kwargs)
+
+        if resp:
+            return {"file_data_path": file_data_path}
+        return None
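
FilterStep loads the serialized FileData record, calls the filter process with `file_data=...`, and keeps the record only when the call returns a truthy value. A hypothetical standalone filter that satisfies that contract (the real `Filterer` in `processes/filter.py` is not rendered in this diff):

```python
from typing import Any, Optional

from unstructured_ingest.v2.interfaces import FileData


class MaxSizeFilter:
    """Hypothetical filter honoring the contract FilterStep expects: truthy means keep."""

    def __init__(self, max_file_size: Optional[int] = None):
        self.max_file_size = max_file_size

    def is_async(self) -> bool:
        return False

    def run(self, file_data: FileData, **kwargs: Any) -> bool:
        size = file_data.metadata.filesize_bytes
        if self.max_file_size is None or size is None:
            return True  # nothing to compare against; keep the record
        return size <= self.max_file_size
```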

unstructured_ingest/v2/processes/connectors/astra.py
@@ -7,6 +7,7 @@ from unstructured import __name__ as integration_name
 from unstructured.__version__ import __version__ as integration_version
 
 from unstructured_ingest.enhanced_dataclass import enhanced_field
+from unstructured_ingest.error import DestinationConnectionError
 from unstructured_ingest.utils.data_prep import batch_generator
 from unstructured_ingest.utils.dep_check import requires_dependencies
 from unstructured_ingest.v2.interfaces import (
@@ -94,6 +95,13 @@ class AstraUploader(Uploader):
     upload_config: AstraUploaderConfig
     connector_type: str = CONNECTOR_TYPE
 
+    def precheck(self) -> None:
+        try:
+            self.get_collection()
+        except Exception as e:
+            logger.error(f"Failed to validate connection {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     @requires_dependencies(["astrapy"], extras="astra")
     def get_collection(self) -> "AstraDBCollection":
         from astrapy.db import AstraDB

unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py
@@ -175,6 +175,14 @@ class AzureCognitiveSearchUploader(Uploader):
         ),
     )
 
+    def precheck(self) -> None:
+        try:
+            client = self.connection_config.generate_client()
+            client.get_document_count()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
+
     def write_dict_wrapper(self, elements_dict):
         return self.write_dict(elements_dict=elements_dict)
 

unstructured_ingest/v2/processes/connectors/chroma.py
@@ -111,10 +111,13 @@ class ChromaUploader(Uploader):
     connector_type: str = CONNECTOR_TYPE
     upload_config: ChromaUploaderConfig
     connection_config: ChromaConnectionConfig
-    client: Optional["Client"] = field(init=False)
 
-    def …
-    …
+    def precheck(self) -> None:
+        try:
+            self.create_client()
+        except Exception as e:
+            logger.error(f"failed to validate connection: {e}", exc_info=True)
+            raise DestinationConnectionError(f"failed to validate connection: {e}")
 
     @requires_dependencies(["chromadb"], extras="chroma")
     def create_client(self) -> "Client":
@@ -187,10 +190,9 @@
             f"collection {self.connection_config.collection_name} "
             f"at {self.connection_config.host}",
         )
+        client = self.create_client()
 
-        collection = …
-            name=self.connection_config.collection_name
-        )
+        collection = client.get_or_create_collection(name=self.connection_config.collection_name)
         for chunk in batch_generator(elements_dict, self.upload_config.batch_size):
             self.upsert_batch(collection, self.prepare_chroma_list(chunk))
 