unstructured-ingest 0.0.2.dev0__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/cli.py +6 -1
- unstructured_ingest/cli/cmds/__init__.py +4 -4
- unstructured_ingest/cli/cmds/{astra.py → astradb.py} +9 -9
- unstructured_ingest/cli/interfaces.py +13 -6
- unstructured_ingest/connector/{astra.py → astradb.py} +29 -29
- unstructured_ingest/connector/biomed.py +12 -5
- unstructured_ingest/connector/confluence.py +3 -3
- unstructured_ingest/connector/github.py +3 -2
- unstructured_ingest/connector/google_drive.py +1 -2
- unstructured_ingest/connector/mongodb.py +1 -2
- unstructured_ingest/connector/notion/client.py +31 -16
- unstructured_ingest/connector/notion/connector.py +3 -2
- unstructured_ingest/connector/registry.py +2 -2
- unstructured_ingest/connector/vectara.py +7 -2
- unstructured_ingest/interfaces.py +13 -9
- unstructured_ingest/pipeline/interfaces.py +8 -3
- unstructured_ingest/pipeline/reformat/chunking.py +13 -9
- unstructured_ingest/pipeline/reformat/embedding.py +3 -3
- unstructured_ingest/runner/__init__.py +2 -2
- unstructured_ingest/runner/{astra.py → astradb.py} +7 -7
- unstructured_ingest/runner/writers/__init__.py +2 -2
- unstructured_ingest/runner/writers/{astra.py → astradb.py} +7 -7
- unstructured_ingest/utils/chunking.py +45 -0
- unstructured_ingest/utils/dep_check.py +1 -1
- unstructured_ingest/utils/google_filetype.py +9 -0
- unstructured_ingest/v2/cli/base/cmd.py +66 -12
- unstructured_ingest/v2/cli/base/dest.py +21 -12
- unstructured_ingest/v2/cli/base/src.py +35 -21
- unstructured_ingest/v2/cli/cmds.py +14 -0
- unstructured_ingest/v2/cli/{utils.py → utils/click.py} +36 -89
- unstructured_ingest/v2/cli/utils/model_conversion.py +199 -0
- unstructured_ingest/v2/interfaces/__init__.py +2 -1
- unstructured_ingest/v2/interfaces/connector.py +5 -7
- unstructured_ingest/v2/interfaces/downloader.py +17 -8
- unstructured_ingest/v2/interfaces/file_data.py +13 -2
- unstructured_ingest/v2/interfaces/indexer.py +3 -4
- unstructured_ingest/v2/interfaces/process.py +3 -4
- unstructured_ingest/v2/interfaces/processor.py +10 -10
- unstructured_ingest/v2/interfaces/upload_stager.py +3 -3
- unstructured_ingest/v2/interfaces/uploader.py +3 -3
- unstructured_ingest/v2/pipeline/interfaces.py +3 -5
- unstructured_ingest/v2/pipeline/pipeline.py +73 -7
- unstructured_ingest/v2/pipeline/steps/chunk.py +5 -11
- unstructured_ingest/v2/pipeline/steps/download.py +90 -24
- unstructured_ingest/v2/pipeline/steps/embed.py +5 -11
- unstructured_ingest/v2/pipeline/steps/filter.py +35 -0
- unstructured_ingest/v2/pipeline/steps/index.py +14 -10
- unstructured_ingest/v2/pipeline/steps/partition.py +5 -5
- unstructured_ingest/v2/pipeline/steps/stage.py +4 -7
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -6
- unstructured_ingest/v2/pipeline/steps/upload.py +2 -9
- unstructured_ingest/v2/processes/__init__.py +18 -0
- unstructured_ingest/v2/processes/chunker.py +74 -28
- unstructured_ingest/v2/processes/connector_registry.py +8 -2
- unstructured_ingest/v2/processes/connectors/__init__.py +13 -3
- unstructured_ingest/v2/processes/connectors/{astra.py → astradb.py} +53 -35
- unstructured_ingest/v2/processes/connectors/azure_cognitive_search.py +38 -27
- unstructured_ingest/v2/processes/connectors/chroma.py +38 -27
- unstructured_ingest/v2/processes/connectors/couchbase.py +151 -0
- unstructured_ingest/v2/processes/connectors/databricks_volumes.py +95 -31
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +92 -53
- unstructured_ingest/v2/processes/connectors/fsspec/azure.py +47 -16
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +23 -13
- unstructured_ingest/v2/processes/connectors/fsspec/dropbox.py +18 -11
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +49 -61
- unstructured_ingest/v2/processes/connectors/fsspec/gcs.py +46 -13
- unstructured_ingest/v2/processes/connectors/fsspec/s3.py +50 -20
- unstructured_ingest/v2/processes/connectors/fsspec/sftp.py +27 -28
- unstructured_ingest/v2/processes/connectors/google_drive.py +52 -42
- unstructured_ingest/v2/processes/connectors/local.py +36 -28
- unstructured_ingest/v2/processes/connectors/milvus.py +22 -18
- unstructured_ingest/v2/processes/connectors/mongodb.py +32 -22
- unstructured_ingest/v2/processes/connectors/onedrive.py +31 -16
- unstructured_ingest/v2/processes/connectors/opensearch.py +81 -43
- unstructured_ingest/v2/processes/connectors/pinecone.py +29 -23
- unstructured_ingest/v2/processes/connectors/salesforce.py +36 -26
- unstructured_ingest/v2/processes/connectors/sharepoint.py +64 -33
- unstructured_ingest/v2/processes/connectors/singlestore.py +11 -15
- unstructured_ingest/v2/processes/connectors/sql.py +52 -39
- unstructured_ingest/v2/processes/connectors/weaviate.py +35 -18
- unstructured_ingest/v2/processes/embedder.py +106 -47
- unstructured_ingest/v2/processes/filter.py +60 -0
- unstructured_ingest/v2/processes/partitioner.py +79 -33
- unstructured_ingest/v2/processes/uncompress.py +3 -3
- unstructured_ingest/v2/utils.py +45 -0
- unstructured_ingest-0.0.4.dist-info/METADATA +571 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/RECORD +92 -116
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/WHEEL +1 -1
- unstructured_ingest/v2/cli/cmds/__init__.py +0 -89
- unstructured_ingest/v2/cli/cmds/astra.py +0 -85
- unstructured_ingest/v2/cli/cmds/azure_cognitive_search.py +0 -72
- unstructured_ingest/v2/cli/cmds/chroma.py +0 -108
- unstructured_ingest/v2/cli/cmds/databricks_volumes.py +0 -161
- unstructured_ingest/v2/cli/cmds/elasticsearch.py +0 -159
- unstructured_ingest/v2/cli/cmds/fsspec/azure.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/box.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/dropbox.py +0 -58
- unstructured_ingest/v2/cli/cmds/fsspec/fsspec.py +0 -77
- unstructured_ingest/v2/cli/cmds/fsspec/gcs.py +0 -81
- unstructured_ingest/v2/cli/cmds/fsspec/s3.py +0 -84
- unstructured_ingest/v2/cli/cmds/fsspec/sftp.py +0 -80
- unstructured_ingest/v2/cli/cmds/google_drive.py +0 -74
- unstructured_ingest/v2/cli/cmds/local.py +0 -60
- unstructured_ingest/v2/cli/cmds/milvus.py +0 -72
- unstructured_ingest/v2/cli/cmds/mongodb.py +0 -62
- unstructured_ingest/v2/cli/cmds/onedrive.py +0 -91
- unstructured_ingest/v2/cli/cmds/opensearch.py +0 -93
- unstructured_ingest/v2/cli/cmds/pinecone.py +0 -62
- unstructured_ingest/v2/cli/cmds/salesforce.py +0 -79
- unstructured_ingest/v2/cli/cmds/sharepoint.py +0 -112
- unstructured_ingest/v2/cli/cmds/singlestore.py +0 -96
- unstructured_ingest/v2/cli/cmds/sql.py +0 -84
- unstructured_ingest/v2/cli/cmds/weaviate.py +0 -100
- unstructured_ingest/v2/cli/configs/__init__.py +0 -6
- unstructured_ingest/v2/cli/configs/chunk.py +0 -89
- unstructured_ingest/v2/cli/configs/embed.py +0 -74
- unstructured_ingest/v2/cli/configs/partition.py +0 -99
- unstructured_ingest/v2/cli/configs/processor.py +0 -88
- unstructured_ingest/v2/cli/interfaces.py +0 -27
- unstructured_ingest/v2/pipeline/utils.py +0 -15
- unstructured_ingest-0.0.2.dev0.dist-info/METADATA +0 -321
- /unstructured_ingest/v2/cli/{cmds/fsspec → utils}/__init__.py +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.2.dev0.dist-info → unstructured_ingest-0.0.4.dist-info}/top_level.txt +0 -0
unstructured_ingest/v2/pipeline/pipeline.py

```diff
@@ -9,12 +9,12 @@ from unstructured_ingest.v2.logger import logger, make_default_logger
 from unstructured_ingest.v2.pipeline.steps.chunk import Chunker, ChunkStep
 from unstructured_ingest.v2.pipeline.steps.download import DownloaderT, DownloadStep
 from unstructured_ingest.v2.pipeline.steps.embed import Embedder, EmbedStep
+from unstructured_ingest.v2.pipeline.steps.filter import Filterer, FilterStep
 from unstructured_ingest.v2.pipeline.steps.index import IndexerT, IndexStep
 from unstructured_ingest.v2.pipeline.steps.partition import Partitioner, PartitionStep
 from unstructured_ingest.v2.pipeline.steps.stage import UploadStager, UploadStageStep
 from unstructured_ingest.v2.pipeline.steps.uncompress import Uncompressor, UncompressStep
 from unstructured_ingest.v2.pipeline.steps.upload import Uploader, UploadStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.chunker import ChunkerConfig
 from unstructured_ingest.v2.processes.connector_registry import (
     ConnectionConfig,
@@ -27,6 +27,7 @@ from unstructured_ingest.v2.processes.connector_registry import (
 )
 from unstructured_ingest.v2.processes.connectors.local import LocalUploader
 from unstructured_ingest.v2.processes.embedder import EmbedderConfig
+from unstructured_ingest.v2.processes.filter import FiltererConfig
 from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
@@ -37,22 +38,33 @@ class PipelineError(Exception):
 @dataclass
 class Pipeline:
     context: ProcessorConfig
+
     indexer: InitVar[IndexerT]
     indexer_step: IndexStep = field(init=False)
+
     downloader: InitVar[DownloaderT]
     downloader_step: DownloadStep = field(init=False)
+
     partitioner: InitVar[Partitioner]
     partitioner_step: PartitionStep = field(init=False)
+
     chunker: InitVar[Optional[Chunker]] = None
     chunker_step: ChunkStep = field(init=False, default=None)
+
     embedder: InitVar[Optional[Embedder]] = None
     embedder_step: EmbedStep = field(init=False, default=None)
+
     stager: InitVar[Optional[UploadStager]] = None
     stager_step: UploadStageStep = field(init=False, default=None)
+
     uploader: InitVar[Uploader] = field(default=LocalUploader())
     uploader_step: UploadStep = field(init=False, default=None)
+
     uncompress_step: UncompressStep = field(init=False, default=None)
 
+    filterer: InitVar[Optional[Filterer]] = None
+    filter_step: FilterStep = field(init=False, default=None)
+
     def __post_init__(
         self,
         indexer: IndexerT,
@@ -62,10 +74,12 @@ class Pipeline:
         embedder: Embedder = None,
         stager: UploadStager = None,
         uploader: Uploader = None,
+        filterer: Filterer = None,
     ):
         make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
         self.indexer_step = IndexStep(process=indexer, context=self.context)
         self.downloader_step = DownloadStep(process=downloader, context=self.context)
+        self.filter_step = FilterStep(process=filterer, context=self.context) if filterer else None
         self.partitioner_step = PartitionStep(process=partitioner, context=self.context)
         self.chunker_step = ChunkStep(process=chunker, context=self.context) if chunker else None
@@ -109,6 +123,7 @@ class Pipeline:
     def run(self):
         try:
             start_time = time()
+            self._run_prechecks()
             self._run()
             logger.info(f"Finished ingest process in {time() - start_time}s")
         finally:
```
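`run()` now fails fast: before any documents move, `_run_prechecks()` asks every configured step's process to validate itself, collects all failures, and aborts with a `PipelineError`. A minimal standalone sketch of that pattern, using hypothetical stand-in step objects rather than the package's own `PipelineStep` classes:

```python
import logging

logger = logging.getLogger(__name__)


class PipelineError(Exception):
    pass


class FakeStep:
    """Hypothetical stand-in for a pipeline step whose process exposes precheck()."""

    def __init__(self, name: str, ok: bool):
        self.name = name
        self.ok = ok

    def precheck(self) -> None:
        # A real connector would attempt a lightweight connection here.
        if not self.ok:
            raise ConnectionError(f"{self.name} could not connect")


def run_prechecks(steps: list[FakeStep]) -> None:
    # Collect every failure before raising, so all misconfigurations surface at once.
    failures = {}
    for step in steps:
        try:
            step.precheck()
        except Exception as e:
            failures[step.name] = f"[{type(e).__name__}] {e}"
    if failures:
        for name, err in failures.items():
            logger.error(f"Step precheck failure: {name}: {err}")
        raise PipelineError("Precheck failed")


run_prechecks([FakeStep("indexer", ok=True), FakeStep("uploader", ok=True)])
```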
```diff
@@ -130,11 +145,39 @@ class Pipeline:
         final = [f for f in flat if f]
         return final or None
 
+    def _run_prechecks(self):
+        steps = [self.indexer_step, self.downloader_step, self.partitioner_step, self.uploader_step]
+        if self.chunker_step:
+            steps.append(self.chunker_step)
+        if self.embedder_step:
+            steps.append(self.embedder_step)
+        if self.uncompress_step:
+            steps.append(self.uncompress_step)
+        if self.stager_step:
+            steps.append(self.stager_step)
+        failures = {}
+        for step in steps:
+            try:
+                step.process.precheck()
+            except Exception as e:
+                failures[step.process.__class__.__name__] = f"[{type(e).__name__}] {e}"
+        if failures:
+            for k, v in failures.items():
+                logger.error(f"Step precheck failure: {k}: {v}")
+            raise PipelineError("Precheck failed")
+
+    def apply_filter(self, records: list[dict]) -> list[dict]:
+        if not self.filter_step:
+            return records
+        data_to_filter = [{"file_data_path": i["file_data_path"]} for i in records]
+        filtered_data = self.filter_step(data_to_filter)
+        filtered_data = [f for f in filtered_data if f is not None]
+        filtered_file_data_paths = [r["file_data_path"] for r in filtered_data]
+        filtered_records = [r for r in records if r["file_data_path"] in filtered_file_data_paths]
+        return filtered_records
+
     def _run(self):
-        logger.info(
-            f"Running local pipline: {self} with configs: "
-            f"{sterilize_dict(self.context.to_dict(redact_sensitive=True))}"
-        )
+        logger.info(f"Running local pipline: {self} with configs: " f"{self.context.json()}")
         if self.context.mp_supported:
             manager = mp.Manager()
             self.context.status = manager.dict()
```
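`apply_filter` deliberately filters on `file_data_path` alone and then joins the surviving paths back against the full records, so the filter step never needs to see downloader-specific fields. A reduced sketch of that winnowing, with plain dicts and a callable standing in for the real `FilterStep`:

```python
def apply_filter(records: list[dict], keep) -> list[dict]:
    # Run only the file_data_path through the filter, mirroring Pipeline.apply_filter.
    data_to_filter = [{"file_data_path": r["file_data_path"]} for r in records]
    filtered = [d for d in data_to_filter if keep(d["file_data_path"])]
    kept_paths = {d["file_data_path"] for d in filtered}
    # Join the surviving paths back against the original, fuller records.
    return [r for r in records if r["file_data_path"] in kept_paths]


records = [
    {"file_data_path": "a.json", "path": "/tmp/a.pdf"},
    {"file_data_path": "b.json", "path": "/tmp/b.tmp"},
]
print(apply_filter(records, keep=lambda p: p.startswith("a")))
# -> [{'file_data_path': 'a.json', 'path': '/tmp/a.pdf'}]
```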
```diff
@@ -147,18 +190,33 @@ class Pipeline:
         if not indices_inputs:
             return
 
+        # Initial filtering on indexed content
+        indices_inputs = self.apply_filter(records=indices_inputs)
+        if not indices_inputs:
+            return
+
         # Download associated content to local file system
         downloaded_data = self.downloader_step(indices_inputs)
         downloaded_data = self.clean_results(results=downloaded_data)
         if not downloaded_data:
             return
 
+        # Post download filtering
+        downloaded_data = self.apply_filter(records=downloaded_data)
+        if not downloaded_data:
+            return
+
         # Run uncompress if available
         if self.uncompress_step:
             downloaded_data = self.uncompress_step(downloaded_data)
             # Flatten list of lists
             downloaded_data = self.clean_results(results=downloaded_data)
 
+        # Post uncompress filtering
+        downloaded_data = self.apply_filter(records=downloaded_data)
+        if not downloaded_data:
+            return
+
         if not downloaded_data:
             return
@@ -179,9 +237,14 @@ class Pipeline:
         self.uploader_step(iterable=elements)
 
     def __str__(self):
-        s = [str(self.indexer_step)
+        s = [str(self.indexer_step)]
+        if filter_step := self.filter_step:
+            s.append(str(filter_step))
+        s.append(str(self.downloader_step))
+        if filter_step := self.filter_step:
+            s.append(str(filter_step))
         if uncompress_step := self.uncompress_step:
-            s.
+            s.extend([str(uncompress_step), str(filter_step)])
         s.append(str(self.partitioner_step))
         if chunker_step := self.chunker_step:
             s.append(str(chunker_step))
@@ -200,6 +263,7 @@ class Pipeline:
         downloader_config: DownloaderConfigT,
         source_connection_config: ConnectionConfig,
         partitioner_config: PartitionerConfig,
+        filterer_config: FiltererConfig = None,
         chunker_config: Optional[ChunkerConfig] = None,
         embedder_config: Optional[EmbedderConfig] = None,
         destination_connection_config: Optional[ConnectionConfig] = None,
@@ -235,6 +299,8 @@ class Pipeline:
             ),
             "partitioner": Partitioner(config=partitioner_config),
         }
+        if filterer_config:
+            pipeline_kwargs["filterer"] = Filterer(config=filterer_config)
         if chunker_config:
             pipeline_kwargs["chunker"] = Chunker(config=chunker_config)
         if embedder_config:
```
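In `from_configs`, the filterer joins chunker and embedder as an opt-in step: the process object is only constructed when its config is passed. A stripped-down sketch of that conditional wiring (the names here are stand-ins, not the package's classes):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class StubConfig:
    """Hypothetical config object standing in for FiltererConfig/ChunkerConfig."""

    name: str


def build_pipeline_kwargs(
    partitioner_config: StubConfig,
    filterer_config: Optional[StubConfig] = None,
    chunker_config: Optional[StubConfig] = None,
) -> dict:
    # Required processes are always created; optional ones only when configured.
    pipeline_kwargs = {"partitioner": f"Partitioner({partitioner_config.name})"}
    if filterer_config:
        pipeline_kwargs["filterer"] = f"Filterer({filterer_config.name})"
    if chunker_config:
        pipeline_kwargs["chunker"] = f"Chunker({chunker_config.name})"
    return pipeline_kwargs


print(build_pipeline_kwargs(StubConfig("fast"), filterer_config=StubConfig("globs")))
```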
unstructured_ingest/v2/pipeline/steps/chunk.py

```diff
@@ -5,13 +5,11 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
-from unstructured.staging.base import elements_to_dicts
-
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.chunker import Chunker
+from unstructured_ingest.v2.utils import serialize_base_model_json
 
 STEP_ID = "chunk"
@@ -30,11 +28,7 @@ class ChunkStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.chunking_strategy})"
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
-            if self.process.config
-            else None
-        )
+        config = self.process.config.json() if self.process.config else None
         logger.info(f"Created {self.identifier} with configs: {config}")
 
     def should_chunk(self, filepath: Path, file_data: FileData) -> bool:
@@ -72,13 +66,13 @@ class ChunkStep(PipelineStep):
         chunked_content_raw = await fn(**fn_kwargs)
         self._save_output(
             output_filepath=str(output_filepath),
-            chunked_content=elements_to_dicts(chunked_content_raw),
+            chunked_content=chunked_content_raw,
         )
         return ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath))
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = 
-        self.process.config
+        hashable_string = serialize_base_model_json(
+            model=self.process.config, sort_keys=True, ensure_ascii=True
         )
         if extras:
             hashable_string += "".join(extras)
```
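All of the step `get_hash` implementations now share one recipe: serialize the config deterministically, append any extras, and keep the first 12 hex characters of a SHA-256 digest, which becomes part of the cached output's identity. A self-contained sketch, with `json.dumps` standing in for `serialize_base_model_json` (whose implementation lives in the new `v2/utils.py` and is not shown in this diff):

```python
import hashlib
import json
from typing import Optional


def get_hash(config: dict, extras: Optional[list[str]] = None) -> str:
    # Deterministic serialization so the same config always yields the same hash.
    hashable_string = json.dumps(config, sort_keys=True, ensure_ascii=True)
    if extras:
        hashable_string += "".join(extras)
    # Truncated digest: short enough for filenames, stable across runs.
    return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]


print(get_hash({"chunking_strategy": "by_title", "max_characters": 500}))
print(get_hash({"max_characters": 500, "chunking_strategy": "by_title"}))  # same hash
```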
unstructured_ingest/v2/pipeline/steps/download.py

```diff
@@ -2,13 +2,14 @@ import asyncio
 import hashlib
 import json
 from dataclasses import dataclass
+from pathlib import Path
 from typing import Callable, Optional, TypedDict, TypeVar
 
 from unstructured_ingest.v2.interfaces import FileData, download_responses
 from unstructured_ingest.v2.interfaces.downloader import Downloader
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
+from unstructured_ingest.v2.utils import serialize_base_model_json
 
 DownloaderT = TypeVar("DownloaderT", bound=Downloader)
@@ -29,15 +30,9 @@ class DownloadStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.download_config.to_dict(redact_sensitive=True))
-            if self.process.download_config
-            else None
-        )
+        config = self.process.download_config.json() if self.process.download_config else None
         connection_config = (
-            sterilize_dict(self.process.connection_config.to_dict(redact_sensitive=True))
-            if self.process.connection_config
-            else None
+            self.process.connection_config.json() if self.process.connection_config else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "
@@ -70,11 +65,40 @@ class DownloadStep(PipelineStep):
             return True
         return False
 
+    def update_file_data(
+        self, file_data: FileData, file_data_path: Path, download_path: Path
+    ) -> None:
+        file_size_bytes = download_path.stat().st_size
+        changed = False
+        if not file_data.metadata.filesize_bytes and file_size_bytes:
+            changed = True
+            file_data.metadata.filesize_bytes = file_size_bytes
+        if (
+            file_data.metadata.filesize_bytes
+            and file_data.metadata.filesize_bytes != file_size_bytes
+        ):
+            logger.warning(
+                f"file size in original file data "
+                f"({file_data.metadata.filesize_bytes}) doesn't "
+                f"match size of local file: {file_size_bytes}, updating"
+            )
+            changed = True
+            file_data.metadata.filesize_bytes = file_size_bytes
+        if changed:
+            logger.debug(f"Updating file data with new content: {file_data.to_dict()}")
+            with file_data_path.open("w") as file:
+                json.dump(file_data.to_dict(), file, indent=2)
+
     async def _run_async(self, fn: Callable, file_data_path: str) -> list[DownloadStepResponse]:
         file_data = FileData.from_file(path=file_data_path)
         download_path = self.process.get_download_path(file_data=file_data)
         if not self.should_download(file_data=file_data, file_data_path=file_data_path):
             logger.debug(f"Skipping download, file already exists locally: {download_path}")
+            self.update_file_data(
+                file_data=file_data,
+                file_data_path=Path(file_data_path),
+                download_path=download_path,
+            )
             return [DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))]
         fn_kwargs = {"file_data": file_data}
         if not asyncio.iscoroutinefunction(fn):
```
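The new `update_file_data` reconciles the recorded `filesize_bytes` against the file actually on disk and rewrites the persisted file data only when something changed. A standalone sketch of the same reconciliation, using a plain dict in place of `FileData`:

```python
import json
import tempfile
from pathlib import Path


def update_file_data(file_data: dict, file_data_path: Path, download_path: Path) -> None:
    file_size_bytes = download_path.stat().st_size
    recorded = file_data["metadata"].get("filesize_bytes")
    changed = False
    if not recorded and file_size_bytes:
        changed = True
    elif recorded and recorded != file_size_bytes:
        print(f"recorded size {recorded} != local size {file_size_bytes}, updating")
        changed = True
    if changed:
        file_data["metadata"]["filesize_bytes"] = file_size_bytes
        # Persist only when the metadata actually changed, avoiding needless writes.
        file_data_path.write_text(json.dumps(file_data, indent=2))


with tempfile.TemporaryDirectory() as d:
    download = Path(d) / "doc.pdf"
    download.write_bytes(b"hello")
    record = Path(d) / "doc.json"
    fd = {"metadata": {"filesize_bytes": None}}
    record.write_text(json.dumps(fd))
    update_file_data(fd, record, download)
    print(json.loads(record.read_text()))  # filesize_bytes now 5
```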
```diff
@@ -85,26 +109,60 @@ class DownloadStep(PipelineStep):
         else:
             download_results = await fn(**fn_kwargs)
         return self.create_step_results(
-            current_file_data_path=file_data_path,
+            current_file_data_path=file_data_path,
+            download_results=download_results,
+            current_file_data=file_data,
         )
 
     def create_step_results(
-        self,
+        self,
+        current_file_data_path: str,
+        current_file_data: FileData,
+        download_results: download_responses,
     ) -> list[DownloadStepResponse]:
+        responses = []
         if not isinstance(download_results, list):
-
-
-
+            file_data = current_file_data
+            file_data_path = current_file_data_path
+            download_path = download_results["path"]
+            if download_results["file_data"].identifier == current_file_data.identifier:
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
+                )
+                responses = [
+                    DownloadStepResponse(file_data_path=file_data_path, path=str(download_path))
+                ]
+            else:
+                file_data = download_results["file_data"]
+                file_data_path = self.persist_new_file_data(file_data=file_data)
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
                 )
-
+                responses = [
+                    DownloadStepResponse(
+                        file_data_path=current_file_data_path, path=str(download_results["path"])
+                    )
+                ]
+        else:
             # Supplemental results generated as part of the download process
-
-
-
-
-
-
-
+            for res in download_results:
+                file_data = res["file_data"]
+                file_data_path = self.persist_new_file_data(file_data=file_data)
+                download_path = res["path"]
+                self.update_file_data(
+                    file_data=file_data,
+                    file_data_path=Path(file_data_path),
+                    download_path=download_path,
+                )
+                responses.append(
+                    DownloadStepResponse(file_data_path=file_data_path, path=res["path"])
+                )
+
+        return responses
 
     def persist_new_file_data(self, file_data: FileData) -> str:
         record_hash = self.get_hash(extras=[file_data.identifier])
@@ -116,9 +174,17 @@ class DownloadStep(PipelineStep):
         return str(filepath)
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
-
-
+        download_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.download_config)
+        )
+        connection_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.connection_config)
         )
+        hashable_dict = {
+            "download_config": download_config_dict,
+            "connection_config": connection_config_dict,
+        }
+        hashable_string = json.dumps(hashable_dict, sort_keys=True)
         if extras:
             hashable_string += "".join(extras)
         return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
```
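`create_step_results` now accepts both shapes a downloader may return: a single response for the indexed record, or a list of supplemental results that each carry their own `file_data` and must be persisted separately. A deliberately simplified sketch of that normalization, with dicts standing in for the real response types and the identifier-matching and persistence logic elided:

```python
from typing import Union


def normalize_results(
    current_path: str,
    results: Union[dict, list[dict]],
) -> list[dict]:
    # Coerce the single-result case into a list, then emit one response per result.
    results_list = results if isinstance(results, list) else [results]
    responses = []
    for res in results_list:
        # In the real step, supplemental results (e.g. files produced during the
        # download) get their own persisted file data; here we only tag them.
        responses.append({"file_data_path": current_path, "path": res["path"]})
    return responses


print(normalize_results("a.json", {"path": "/tmp/a.pdf"}))
print(normalize_results("a.json", [{"path": "/tmp/a-1.pdf"}, {"path": "/tmp/a-2.pdf"}]))
```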
unstructured_ingest/v2/pipeline/steps/embed.py

```diff
@@ -5,13 +5,11 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
 
-from unstructured.staging.base import elements_to_dicts
-
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.embedder import Embedder
+from unstructured_ingest.v2.utils import serialize_base_model_json
 
 STEP_ID = "embed"
@@ -30,11 +28,7 @@ class EmbedStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.embedding_provider})"
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
-            if self.process.config
-            else None
-        )
+        config = self.process.config.json() if self.process.config else None
         logger.info(f"Created {self.identifier} with configs: {config}")
 
     def should_embed(self, filepath: Path, file_data: FileData) -> bool:
@@ -71,13 +65,13 @@ class EmbedStep(PipelineStep):
 
         self._save_output(
             output_filepath=str(output_filepath),
-            embedded_content=elements_to_dicts(embed_content_raw),
+            embedded_content=embed_content_raw,
         )
         return EmbedStepResponse(file_data_path=file_data_path, path=str(output_filepath))
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = 
-        self.process.config
+        hashable_string = serialize_base_model_json(
+            model=self.process.config, sort_keys=True, ensure_ascii=True
         )
         if extras:
             hashable_string += "".join(extras)
```
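Several hunks above call `serialize_base_model_json` from the new `unstructured_ingest/v2/utils.py` (+45 lines, not shown in this diff). A plausible minimal version, assuming the configs are Pydantic-v1-style models exposing `.json()`, might look like:

```python
import json
from typing import Any


def serialize_base_model_json(model: Any, **json_kwargs) -> str:
    # Assumption: config objects behave like Pydantic v1 models with .json().
    # Round-tripping through a plain dict lets json.dumps kwargs (sort_keys,
    # ensure_ascii, ...) apply uniformly, which .json() alone would not honor.
    if model is None:
        return json.dumps(None, **json_kwargs)
    model_dict = json.loads(model.json())
    return json.dumps(model_dict, **json_kwargs)


class FakeConfig:
    """Hypothetical stand-in for a config model with a Pydantic-style .json()."""

    def __init__(self, **fields):
        self.fields = fields

    def json(self) -> str:
        return json.dumps(self.fields)


print(serialize_base_model_json(FakeConfig(b=2, a=1), sort_keys=True))  # {"a": 1, "b": 2}
```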
unstructured_ingest/v2/pipeline/steps/filter.py (new file)

```diff
@@ -0,0 +1,35 @@
+import asyncio
+from dataclasses import dataclass
+from typing import Callable, Optional
+
+from unstructured_ingest.v2.interfaces.file_data import FileData
+from unstructured_ingest.v2.logger import logger
+from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
+from unstructured_ingest.v2.processes.filter import Filterer
+
+STEP_ID = "filter"
+
+
+@dataclass
+class FilterStep(PipelineStep):
+    process: Filterer
+    identifier: str = STEP_ID
+
+    def __post_init__(self):
+        config = self.process.config.json() if self.process.config else None
+        logger.info(f"Created {self.identifier} with configs: {config}")
+
+    async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
+        file_data = FileData.from_file(path=file_data_path)
+        fn_kwargs = {"file_data": file_data}
+        if not asyncio.iscoroutinefunction(fn):
+            resp = fn(**fn_kwargs)
+        elif semaphore := self.context.semaphore:
+            async with semaphore:
+                resp = await fn(**fn_kwargs)
+        else:
+            resp = await fn(**fn_kwargs)
+
+        if resp:
+            return {"file_data_path": file_data_path}
+        return None
```
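`FilterStep` treats any truthy result from the `Filterer` process as "keep" and maps it back to the record's `file_data_path`. The fields of the released `FiltererConfig` (in `v2/processes/filter.py`, +60 lines) aren't visible in this diff; a hypothetical glob-based filterer in the same spirit might be:

```python
import fnmatch
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class FakeFileData:
    """Stand-in for FileData; only the bits the filter needs."""

    identifier: str
    fullpath: Optional[str] = None


@dataclass
class GlobFiltererConfig:
    # Hypothetical field: the released FiltererConfig's actual options are not shown here.
    file_glob: Optional[list[str]] = None


@dataclass
class GlobFilterer:
    config: GlobFiltererConfig = field(default_factory=GlobFiltererConfig)

    def run(self, file_data: FakeFileData) -> bool:
        # Truthy result means "keep"; FilterStep maps that to the record's file_data_path.
        if not self.config.file_glob or not file_data.fullpath:
            return True
        return any(fnmatch.fnmatch(file_data.fullpath, g) for g in self.config.file_glob)


f = GlobFilterer(GlobFiltererConfig(file_glob=["*.pdf"]))
print(f.run(FakeFileData("1", "docs/a.pdf")))  # True
print(f.run(FakeFileData("2", "docs/b.csv")))  # False
```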
unstructured_ingest/v2/pipeline/steps/index.py

```diff
@@ -6,7 +6,7 @@ from typing import Generator, Optional, TypeVar
 from unstructured_ingest.v2.interfaces.indexer import Indexer
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
+from unstructured_ingest.v2.utils import serialize_base_model_json
 
 IndexerT = TypeVar("IndexerT", bound=Indexer)
 
@@ -22,15 +22,9 @@ class IndexStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.index_config.to_dict(redact_sensitive=True))
-            if self.process.index_config
-            else None
-        )
+        config = self.process.index_config.json() if self.process.index_config else None
         connection_config = (
-            sterilize_dict(self.process.connection_config.to_dict(redact_sensitive=True))
-            if self.process.connection_config
-            else None
+            self.process.connection_config.json() if self.process.connection_config else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "
@@ -55,7 +49,17 @@ class IndexStep(PipelineStep):
             continue
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
-
+        index_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.index_config, sort_keys=True)
+        )
+        connection_config_dict = json.loads(
+            serialize_base_model_json(model=self.process.connection_config, sort_keys=True)
+        )
+        hashable_dict = {
+            "index_config": index_config_dict,
+            "connection_config": connection_config_dict,
+        }
+        hashable_string = json.dumps(hashable_dict, sort_keys=True)
         if extras:
             hashable_string += "".join(extras)
         return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
```
unstructured_ingest/v2/pipeline/steps/partition.py

```diff
@@ -8,8 +8,8 @@ from typing import Callable, Optional, TypedDict
 from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.partitioner import Partitioner
+from unstructured_ingest.v2.utils import serialize_base_model_json
 
 STEP_ID = "partition"
 
@@ -28,7 +28,7 @@ class PartitionStep(PipelineStep):
         return f"{self.identifier} ({self.process.config.strategy})"
 
     def __post_init__(self):
-        config = 
+        config = self.process.config.json()
         logger.info(f"Created {self.identifier} with configs: {config}")
 
     def should_partition(self, filepath: Path, file_data: FileData) -> bool:
@@ -56,7 +56,7 @@ class PartitionStep(PipelineStep):
         if not self.should_partition(filepath=output_filepath, file_data=file_data):
             logger.debug(f"Skipping partitioning, output already exists: {output_filepath}")
             return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
-        fn_kwargs = {"filename": path, "metadata": file_data.metadata}
+        fn_kwargs = {"filename": path, "metadata": file_data.metadata.to_dict()}
         if not asyncio.iscoroutinefunction(fn):
             partitioned_content = fn(**fn_kwargs)
         elif semaphore := self.context.semaphore:
@@ -70,8 +70,8 @@ class PartitionStep(PipelineStep):
         return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = 
-        self.process.config
+        hashable_string = serialize_base_model_json(
+            model=self.process.config, sort_keys=True, ensure_ascii=True
         )
         if extras:
             hashable_string += "".join(extras)
```
unstructured_ingest/v2/pipeline/steps/stage.py

```diff
@@ -1,6 +1,5 @@
 import asyncio
 import hashlib
-import json
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict
@@ -9,7 +8,7 @@ from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.interfaces.upload_stager import UploadStager
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
+from unstructured_ingest.v2.utils import serialize_base_model_json
 
 STEP_ID = "upload_stage"
 
@@ -29,9 +28,7 @@ class UploadStageStep(PipelineStep):
 
     def __post_init__(self):
         config = (
-            sterilize_dict(self.process.upload_stager_config.to_dict(redact_sensitive=True))
-            if self.process.upload_stager_config
-            else None
+            self.process.upload_stager_config.json() if self.process.upload_stager_config else None
         )
         self.cache_dir.mkdir(parents=True, exist_ok=True)
         logger.info(f"Created {self.identifier} with configs: {config}")
@@ -56,8 +53,8 @@ class UploadStageStep(PipelineStep):
         return UploadStageStepResponse(file_data_path=file_data_path, path=str(staged_output_path))
 
     def get_hash(self, extras: Optional[list[str]]) -> str:
-        hashable_string = 
-        self.process.upload_stager_config
+        hashable_string = serialize_base_model_json(
+            model=self.process.upload_stager_config, sort_keys=True, ensure_ascii=True
         )
         if extras:
             hashable_string += "".join(extras)
```
unstructured_ingest/v2/pipeline/steps/uncompress.py

```diff
@@ -5,7 +5,6 @@ from typing import Callable, TypedDict
 from unstructured_ingest.v2.interfaces.file_data import FileData
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 from unstructured_ingest.v2.processes.uncompress import Uncompressor
 
 STEP_ID = "uncompress"
@@ -21,11 +20,7 @@ class UncompressStep(PipelineStep):
     identifier: str = STEP_ID
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.config.to_dict(redact_sensitive=True))
-            if self.process.config
-            else None
-        )
+        config = self.process.config.json() if self.process.config else None
         logger.info(f"Created {self.identifier} with configs: {config}")
 
     def _run(self, path: str, file_data_path: str) -> list[UncompressStepResponse]:
```
unstructured_ingest/v2/pipeline/steps/upload.py

```diff
@@ -7,7 +7,6 @@ from unstructured_ingest.v2.interfaces import FileData
 from unstructured_ingest.v2.interfaces.uploader import UploadContent, Uploader
 from unstructured_ingest.v2.logger import logger
 from unstructured_ingest.v2.pipeline.interfaces import PipelineStep, iterable_input, timed
-from unstructured_ingest.v2.pipeline.utils import sterilize_dict
 
 STEP_ID = "upload"
 
@@ -26,15 +25,9 @@ class UploadStep(PipelineStep):
         return f"{self.identifier} ({self.process.__class__.__name__})"
 
     def __post_init__(self):
-        config = (
-            sterilize_dict(self.process.upload_config.to_dict(redact_sensitive=True))
-            if self.process.upload_config
-            else None
-        )
+        config = self.process.upload_config.json() if self.process.upload_config else None
         connection_config = (
-            sterilize_dict(self.process.connection_config.to_dict(redact_sensitive=True))
-            if self.process.connection_config
-            else None
+            self.process.connection_config.json() if self.process.connection_config else None
         )
         logger.info(
             f"Created {self.identifier} with configs: {config}, "
```
unstructured_ingest/v2/processes/__init__.py (new file)

```diff
@@ -0,0 +1,18 @@
+from .chunker import Chunker, ChunkerConfig
+from .embedder import Embedder, EmbedderConfig
+from .filter import Filterer, FiltererConfig
+from .partitioner import Partitioner, PartitionerConfig
+from .uncompress import UncompressConfig, Uncompressor
+
+__all__ = [
+    "Chunker",
+    "ChunkerConfig",
+    "Embedder",
+    "EmbedderConfig",
+    "Filterer",
+    "FiltererConfig",
+    "Partitioner",
+    "PartitionerConfig",
+    "Uncompressor",
+    "UncompressConfig",
+]
```
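The new `processes/__init__.py` re-exports the core process classes at package level. Assuming 0.0.4 is installed, imports can move from the individual submodules to the package root:

```python
# Before 0.0.4 these lived only in their submodules, e.g.
#   from unstructured_ingest.v2.processes.chunker import Chunker
from unstructured_ingest.v2.processes import (
    Chunker,
    ChunkerConfig,
    Filterer,
    FiltererConfig,
)
```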