unstructured-ingest 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of unstructured-ingest might be problematic.
- unstructured_ingest/__version__.py +1 -1
- unstructured_ingest/cli/interfaces.py +1 -1
- unstructured_ingest/cli/utils.py +1 -1
- unstructured_ingest/connector/astradb.py +1 -1
- unstructured_ingest/connector/biomed.py +4 -4
- unstructured_ingest/connector/chroma.py +1 -1
- unstructured_ingest/connector/databricks_volumes.py +2 -2
- unstructured_ingest/connector/fsspec/box.py +1 -1
- unstructured_ingest/connector/fsspec/fsspec.py +5 -5
- unstructured_ingest/connector/git.py +1 -1
- unstructured_ingest/connector/google_drive.py +4 -4
- unstructured_ingest/connector/hubspot.py +1 -1
- unstructured_ingest/connector/kafka.py +8 -8
- unstructured_ingest/connector/local.py +1 -1
- unstructured_ingest/connector/notion/helpers.py +4 -4
- unstructured_ingest/connector/onedrive.py +3 -3
- unstructured_ingest/connector/outlook.py +2 -2
- unstructured_ingest/connector/pinecone.py +1 -1
- unstructured_ingest/connector/sharepoint.py +8 -8
- unstructured_ingest/connector/vectara.py +6 -6
- unstructured_ingest/embed/__init__.py +17 -0
- unstructured_ingest/embed/bedrock.py +70 -0
- unstructured_ingest/embed/huggingface.py +73 -0
- unstructured_ingest/embed/interfaces.py +36 -0
- unstructured_ingest/embed/mixedbreadai.py +177 -0
- unstructured_ingest/embed/octoai.py +63 -0
- unstructured_ingest/embed/openai.py +61 -0
- unstructured_ingest/embed/vertexai.py +88 -0
- unstructured_ingest/embed/voyageai.py +69 -0
- unstructured_ingest/interfaces.py +21 -11
- unstructured_ingest/logger.py +1 -1
- unstructured_ingest/pipeline/copy.py +1 -1
- unstructured_ingest/pipeline/interfaces.py +2 -2
- unstructured_ingest/pipeline/partition.py +1 -1
- unstructured_ingest/pipeline/pipeline.py +1 -1
- unstructured_ingest/pipeline/reformat/chunking.py +2 -2
- unstructured_ingest/pipeline/reformat/embedding.py +4 -6
- unstructured_ingest/pipeline/source.py +2 -2
- unstructured_ingest/utils/compression.py +3 -3
- unstructured_ingest/utils/data_prep.py +20 -12
- unstructured_ingest/utils/string_and_date_utils.py +2 -2
- unstructured_ingest/v2/cli/base/cmd.py +3 -3
- unstructured_ingest/v2/cli/base/dest.py +1 -1
- unstructured_ingest/v2/cli/base/src.py +3 -2
- unstructured_ingest/v2/cli/utils/click.py +1 -1
- unstructured_ingest/v2/interfaces/processor.py +48 -13
- unstructured_ingest/v2/logger.py +1 -1
- unstructured_ingest/v2/otel.py +1 -1
- unstructured_ingest/v2/pipeline/interfaces.py +12 -3
- unstructured_ingest/v2/pipeline/pipeline.py +42 -29
- unstructured_ingest/v2/pipeline/steps/chunk.py +3 -3
- unstructured_ingest/v2/pipeline/steps/download.py +17 -2
- unstructured_ingest/v2/pipeline/steps/embed.py +3 -3
- unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
- unstructured_ingest/v2/pipeline/steps/index.py +2 -2
- unstructured_ingest/v2/pipeline/steps/partition.py +3 -3
- unstructured_ingest/v2/pipeline/steps/stage.py +1 -1
- unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
- unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
- unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
- unstructured_ingest/v2/processes/connectors/chroma.py +6 -1
- unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
- unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -4
- unstructured_ingest/v2/processes/connectors/google_drive.py +2 -3
- unstructured_ingest/v2/processes/connectors/local.py +6 -5
- unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
- unstructured_ingest/v2/processes/connectors/onedrive.py +8 -6
- unstructured_ingest/v2/processes/connectors/opensearch.py +1 -1
- unstructured_ingest/v2/processes/connectors/pinecone.py +38 -16
- unstructured_ingest/v2/processes/connectors/sharepoint.py +10 -6
- unstructured_ingest/v2/processes/embedder.py +41 -24
- unstructured_ingest/v2/processes/filter.py +1 -1
- unstructured_ingest/v2/processes/partitioner.py +3 -3
- unstructured_ingest/v2/utils.py +7 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/METADATA +212 -211
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/RECORD +81 -72
- unstructured_ingest/evaluate.py +0 -338
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/LICENSE.md +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/WHEEL +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/entry_points.txt +0 -0
- {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/top_level.txt +0 -0
unstructured_ingest/pipeline/reformat/embedding.py
CHANGED
```diff
@@ -27,8 +27,6 @@ class Embedder(ReformatNode):
         return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]
 
     def run(self, elements_json: str) -> Optional[str]:
-        from unstructured.staging.base import elements_from_json
-
         try:
             elements_json_filename = os.path.basename(elements_json)
             filename_ext = os.path.basename(elements_json_filename)
@@ -46,12 +44,12 @@
                 and json_path.is_file()
                 and json_path.stat().st_size
             ):
-                logger.debug(f"
+                logger.debug(f"file exists: {json_path}, skipping embedding")
                 return str(json_path)
-
+            with open(elements_json) as f:
+                elements = json.load(f)
             embedder = self.embedder_config.get_embedder()
-
-            element_dicts = [e.to_dict() for e in embedded_elements]
+            element_dicts = embedder.embed_documents(elements=elements)
             with open(json_path, "w", encoding="utf8") as output_f:
                 logger.info(f"writing embeddings content to {json_path}")
                 json.dump(element_dicts, output_f, ensure_ascii=False, indent=2)
```
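The rewritten `run()` above no longer rehydrates `Element` objects from JSON; it loads the raw element dictionaries, hands them to `embedder.embed_documents(elements=...)`, and writes the returned dictionaries back out. A minimal sketch of that flow, with a made-up `FakeEmbedder` standing in for whatever `embedder_config.get_embedder()` returns (only the `embed_documents` call shape comes from the diff; everything else here is illustrative):

```python
import json
from pathlib import Path
from typing import Any


class FakeEmbedder:
    """Stand-in for the configured embedder; only the call shape matters here."""

    def embed_documents(self, elements: list[dict]) -> list[dict]:
        # A real embedder would attach real vectors; here we just tag each element dict.
        return [{**e, "embeddings": [0.0, 0.0]} for e in elements]


def embed_elements_file(elements_json: Path, json_path: Path, embedder: Any) -> Path:
    # Load the raw element dicts instead of rehydrating Element objects.
    with open(elements_json) as f:
        elements = json.load(f)
    element_dicts = embedder.embed_documents(elements=elements)
    with open(json_path, "w", encoding="utf8") as output_f:
        json.dump(element_dicts, output_f, ensure_ascii=False, indent=2)
    return json_path
```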
unstructured_ingest/pipeline/source.py
CHANGED
```diff
@@ -24,12 +24,12 @@ class Reader(SourceNode):
                 and doc.filename.is_file()
                 and doc.filename.stat().st_size
             ):
-                logger.info(f"
+                logger.info(f"file exists: {doc.filename}, skipping download")
                 # Still need to fetch metadata if file exists locally
                 doc.update_source_metadata()
             else:
                 serialized_doc = doc.to_json(redact_sensitive=True)
-                logger.debug(f"
+                logger.debug(f"fetching {serialized_doc} - PID: {os.getpid()}")
                 if self.retry_strategy:
                     self.retry_strategy(doc.get_file)
                 else:
```
unstructured_ingest/utils/compression.py
CHANGED
```diff
@@ -22,7 +22,7 @@ TAR_FILE_EXT = [".tar", ".tar.gz", ".tgz"]
 
 def uncompress_file(filename: str, path: Optional[str] = None) -> str:
     """
-    Takes in a compressed zip or tar file and
+    Takes in a compressed zip or tar file and decompresses it
     """
     # Create path if it doesn't already exist
     if path:
@@ -65,7 +65,7 @@ def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str:
     logger.info(f"extracting tar {tar_filename} -> {path}")
     # NOTE: "r:*" mode opens both compressed (e.g ".tar.gz") and uncompressed ".tar" archives
     with tarfile.open(tar_filename, "r:*") as tfile:
-        # NOTE(robinson: Mitigate against malicious content being extracted from the tar file.
+        # NOTE(robinson): Mitigate against malicious content being extracted from the tar file.
         # This was added in Python 3.12
         # Ref: https://docs.python.org/3/library/tarfile.html#extraction-filters
         if sys.version_info >= (3, 12):
@@ -113,6 +113,6 @@ class CompressionSourceConnectorMixin:
             read_config=new_read_configs,
             processor_config=new_process_configs,
         )
-        logger.info(f"
+        logger.info(f"created local source connector: {local_connector.to_json()}")
        local_connector.initialize()
        return local_connector.get_ingest_docs()
```
unstructured_ingest/utils/data_prep.py
CHANGED
```diff
@@ -1,12 +1,15 @@
 import itertools
 import json
 from datetime import datetime
-from typing import Any, Optional, Sequence, cast
+from typing import Any, Iterable, Optional, Sequence, TypeVar, cast
 
 DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
 
+T = TypeVar("T")
+IterableT = Iterable[T]
 
-def batch_generator(iterable, batch_size=100):
+
+def batch_generator(iterable: IterableT, batch_size: int = 100) -> IterableT:
     """A helper function to break an iterable into batches of size batch_size."""
     it = iter(iterable)
     chunk = tuple(itertools.islice(it, batch_size))
@@ -16,23 +19,28 @@ def batch_generator(iterable, batch_size=100):
 
 
 def generator_batching_wbytes(
-    iterable
-
+    iterable: IterableT,
+    batch_size_limit_bytes: Optional[int] = None,
+    max_batch_size: Optional[int] = None,
+) -> IterableT:
+    if not batch_size_limit_bytes and not max_batch_size:
+        return iterable
     """A helper function to break an iterable into chunks of specified bytes."""
     current_batch, current_batch_size = [], 0
 
     for item in iterable:
         item_size_bytes = len(json.dumps(item).encode("utf-8"))
-
-
-        current_batch_size
-
-
-            current_batch.append(item)
-            current_batch_size += item_size_bytes
-        else:
+        if batch_size_limit_bytes and current_batch_size + item_size_bytes > batch_size_limit_bytes:
+            yield current_batch
+            current_batch, current_batch_size = [item], item_size_bytes
+            continue
+        if max_batch_size and len(current_batch) + 1 > max_batch_size:
             yield current_batch
             current_batch, current_batch_size = [item], item_size_bytes
+            continue
+
+        current_batch.append(item)
+        current_batch_size += item_size_bytes
 
     if current_batch:
         yield current_batch
```
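The reworked helpers above add typing and a second, byte-aware batching path: `generator_batching_wbytes` now caps each batch by serialized JSON size (`batch_size_limit_bytes`) and/or by item count (`max_batch_size`), and passes the iterable through untouched when neither limit is set. A standalone sketch of the same batching idea; the function name, helper structure, and demo values below are illustrative, not the package's API:

```python
import json
from typing import Any, Iterable, Iterator, Optional


def batch_by_bytes(
    iterable: Iterable[Any],
    batch_size_limit_bytes: Optional[int] = None,
    max_batch_size: Optional[int] = None,
) -> Iterator[list]:
    """Yield lists capped by serialized JSON size and/or item count."""
    current_batch: list = []
    current_batch_size = 0
    for item in iterable:
        item_size_bytes = len(json.dumps(item).encode("utf-8"))
        # Start a new batch when either limit would be exceeded by adding this item.
        if current_batch and (
            (batch_size_limit_bytes and current_batch_size + item_size_bytes > batch_size_limit_bytes)
            or (max_batch_size and len(current_batch) + 1 > max_batch_size)
        ):
            yield current_batch
            current_batch, current_batch_size = [], 0
        current_batch.append(item)
        current_batch_size += item_size_bytes
    if current_batch:
        yield current_batch


if __name__ == "__main__":
    docs = [{"text": "x" * n} for n in range(1, 8)]
    for batch in batch_by_bytes(docs, batch_size_limit_bytes=40, max_batch_size=3):
        print(len(batch), sum(len(json.dumps(d)) for d in batch))
```

With `batch_size_limit_bytes=40` and `max_batch_size=3`, the toy run above yields batches that close as soon as either limit would be crossed, which is the behavior the new keyword arguments are there to provide.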
unstructured_ingest/utils/string_and_date_utils.py
CHANGED
```diff
@@ -10,13 +10,13 @@ def json_to_dict(json_string: str) -> t.Union[str, t.Dict[str, t.Any]]:
     try:
         return json.loads(json_string)
     except json.JSONDecodeError:
-        # Not
+        # Not necessary an error if it is a path or malformed json
         pass
     try:
         # This is common when single quotes are used instead of double quotes
         return json.loads(json_string.replace("'", '"'))
     except json.JSONDecodeError:
-        # Not
+        # Not necessary an error if it is a path
         pass
     return json_string
 
```
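For readers skimming the hunk: the surrounding function tries strict JSON first, retries with single quotes swapped for double quotes, and otherwise returns the input string unchanged (for example when it is really a file path). A self-contained copy of that fallback order, mirroring the context lines above, with a few made-up sample inputs:

```python
import json
import typing as t


def json_to_dict(json_string: str) -> t.Union[str, t.Dict[str, t.Any]]:
    """Best-effort parse: strict JSON, then single-quote JSON, else return the input unchanged."""
    try:
        return json.loads(json_string)
    except json.JSONDecodeError:
        pass  # not necessarily an error: could be a path or malformed json
    try:
        # Common when single quotes are used instead of double quotes
        return json.loads(json_string.replace("'", '"'))
    except json.JSONDecodeError:
        pass  # not necessarily an error: could be a path
    return json_string


print(json_to_dict('{"a": 1}'))          # {'a': 1}
print(json_to_dict("{'a': 1}"))          # {'a': 1} after quote replacement
print(json_to_dict("/tmp/config.json"))  # returned unchanged as a path string
```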
unstructured_ingest/v2/cli/base/cmd.py
CHANGED
```diff
@@ -102,7 +102,7 @@ class BaseCmd(ABC):
         cmd.params.extend(options)
         return cmd
 
-    def
+    def get_pipeline(
         self,
         src: str,
         source_options: dict[str, Any],
@@ -122,7 +122,7 @@ class BaseCmd(ABC):
             pipeline_kwargs["chunker"] = chunker
         if filterer := self.get_filterer(options=source_options):
             pipeline_kwargs["filterer"] = filterer
-        if embedder := self.
+        if embedder := self.get_embedder(options=source_options):
             pipeline_kwargs["embedder"] = embedder
         if dest:
             logger.debug(
@@ -160,7 +160,7 @@ class BaseCmd(ABC):
         return Filterer(config=filterer_configs)
 
     @staticmethod
-    def
+    def get_embedder(options: dict[str, Any]) -> Optional[Embedder]:
         embedder_config = extract_config(flat_data=options, config=EmbedderConfig)
         if not embedder_config.embedding_provider:
             return None
```
unstructured_ingest/v2/cli/base/dest.py
CHANGED
```diff
@@ -40,7 +40,7 @@ class DestCmd(BaseCmd):
         source_options: dict = ctx.parent.params if ctx.parent else {}
         conform_click_options(options)
         try:
-            pipeline = self.
+            pipeline = self.get_pipeline(
                 src=source_cmd,
                 source_options=source_options,
                 dest=self.cmd_name,
```
unstructured_ingest/v2/cli/base/src.py
CHANGED
```diff
@@ -1,5 +1,6 @@
 import logging
 from dataclasses import dataclass, field
+from typing import Any
 
 import click
 from pydantic import BaseModel
@@ -47,14 +48,14 @@ class SrcCmd(BaseCmd):
         options = self.consolidate_options(options=options)
         return options
 
-    def cmd(self, ctx: click.Context, **options) -> None:
+    def cmd(self, ctx: click.Context, **options: dict[str, Any]) -> None:
         if ctx.invoked_subcommand:
             return
 
         conform_click_options(options)
         logger.setLevel(logging.DEBUG if options.get("verbose", False) else logging.INFO)
         try:
-            pipeline = self.
+            pipeline = self.get_pipeline(src=self.cmd_name, source_options=options)
             pipeline.run()
         except Exception as e:
             logger.error(f"failed to run source command {self.cmd_name}: {e}", exc_info=True)
```
unstructured_ingest/v2/interfaces/processor.py
CHANGED
```diff
@@ -11,21 +11,56 @@ DEFAULT_WORK_DIR = str((Path.home() / ".cache" / "unstructured" / "ingest" / "pi
 class ProcessorConfig(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    reprocess: bool =
-
-
-
-
-
-
+    reprocess: bool = Field(
+        default=False,
+        description="Reprocess a downloaded file even if the relevant structured "
+        "output .json file in output directory already exists.",
+    )
+    verbose: bool = Field(default=False)
+    tqdm: bool = Field(default=False, description="Display tqdm progress bar")
+    work_dir: str = Field(
+        default_factory=lambda: DEFAULT_WORK_DIR,
+        description="Where to place working files when processing each step",
+    )
+    num_processes: int = Field(
+        default=2, description="Number of parallel processes with which to process docs"
+    )
+    max_connections: Optional[int] = Field(
+        default=None, description="Limit of concurrent connectionts"
+    )
+    raise_on_error: bool = Field(
+        default=False,
+        description="Is set, will raise error if any doc in the pipeline fail. "
+        "Otherwise will log error and continue with other docs",
+    )
     disable_parallelism: bool = Field(
-        default_factory=lambda: os.getenv("INGEST_DISABLE_PARALLELISM", "false").lower() == "true"
+        default_factory=lambda: os.getenv("INGEST_DISABLE_PARALLELISM", "false").lower() == "true",
+    )
+    preserve_downloads: bool = Field(
+        default=False, description="Don't delete downloaded files after process completes"
+    )
+    download_only: bool = Field(
+        default=False, description="skip the rest of the process after files are downloaded"
+    )
+    re_download: bool = Field(
+        default=False,
+        description="If set, will re-download downloaded files "
+        "regardless of if they already exist locally",
+    )
+    uncompress: bool = Field(
+        default=False,
+        description="Uncompress any archived files. Currently supporting "
+        "zip and tar files based on file extension.",
+    )
+    iter_delete: bool = Field(
+        default=False,
+        description="If limited on memory, this can be enabled to delete "
+        "cached content as it's used and no longer needed in the pipeline.",
+    )
+    delete_cache: bool = Field(
+        default=False,
+        description="If set, will delete the cache work directory when process finishes",
     )
-    preserve_downloads: bool = False
-    download_only: bool = False
-    max_docs: Optional[int] = None
-    re_download: bool = False
-    uncompress: bool = False
 
     # OTEL support
     otel_endpoint: Optional[str] = Field(
```
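Since `ProcessorConfig` is a plain pydantic model, the new flags are set at construction time. A minimal sketch, assuming the `unstructured_ingest.v2.interfaces` import path that appears elsewhere in this diff and using purely illustrative values:

```python
from unstructured_ingest.v2.interfaces import ProcessorConfig

# Illustrative values only; the field names come from the model shown above.
config = ProcessorConfig(
    num_processes=4,           # parallel worker processes
    reprocess=False,           # reuse existing structured output when present
    iter_delete=True,          # drop each step's cached output once it has been consumed
    delete_cache=True,         # remove the whole work_dir when the run finishes
    preserve_downloads=False,  # allow downloaded files to be cleaned up
    tqdm=True,                 # display a progress bar
)
print(config.work_dir)  # defaults to a per-user cache dir under ~/.cache/unstructured/ingest/
```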
unstructured_ingest/v2/logger.py
CHANGED
```diff
@@ -101,7 +101,7 @@ class SensitiveFormatter(Formatter):
 
 
 def remove_root_handlers(logger: Logger) -> None:
-    # NOTE(robinson)
+    # NOTE(robinson): in some environments such as Google Colab, there is a root handler
     # that doesn't not mask secrets, meaning sensitive info such as api keys appear in logs.
     # Removing these when they exist prevents this behavior
     if logger.root.hasHandlers():
```
unstructured_ingest/v2/otel.py
CHANGED
```diff
@@ -92,7 +92,7 @@ class OtelHandler:
             return None
         from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
 
-        logger.debug(f"
+        logger.debug(f"adding otel exported at {otel_endpoint}")
         trace_exporter = OTLPSpanExporter()
         processor = SimpleSpanProcessor(trace_exporter)
         provider.add_span_processor(processor)
```
unstructured_ingest/v2/pipeline/interfaces.py
CHANGED
```diff
@@ -1,6 +1,9 @@
+from __future__ import annotations
+
 import asyncio
 import logging
 import multiprocessing as mp
+import shutil
 from abc import ABC, abstractmethod
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
@@ -116,10 +119,10 @@ class PipelineStep(ABC):
         iterable = iterable or []
         if iterable:
             logger.info(
-                f"
+                f"calling {self.__class__.__name__} " f"with {len(iterable)} docs", # type: ignore
             )
         else:
-            logger.info(f"
+            logger.info(f"calling {self.__class__.__name__} with no inputs")
         if self.context.async_supported and self.process.is_async():
             return self.process_async(iterable=iterable)
         if self.context.mp_supported:
@@ -132,7 +135,7 @@ class PipelineStep(ABC):
     async def _run_async(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
         raise NotImplementedError
 
-    def run(self, _fn:
+    def run(self, _fn: Callable[..., Any] | None = None, **kwargs: Any) -> Optional[Any]:
         kwargs = kwargs.copy()
         otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.debug)
         tracer = otel_handler.get_tracer()
@@ -178,6 +181,12 @@ class PipelineStep(ABC):
     def cache_dir(self) -> Path:
         return Path(self.context.work_dir) / self.identifier
 
+    def delete_cache(self):
+        if self.context.iter_delete and self.cache_dir.exists():
+            cache_dir = self.cache_dir
+            logger.info(f"deleting {self.identifier} cache dir {cache_dir}")
+            shutil.rmtree(cache_dir)
+
 
 @dataclass
 class BatchPipelineStep(PipelineStep, ABC):
```
unstructured_ingest/v2/pipeline/pipeline.py
CHANGED
```diff
@@ -1,7 +1,11 @@
+from __future__ import annotations
+
 import logging
 import multiprocessing as mp
+import shutil
 from dataclasses import InitVar, dataclass, field
-from
+from pathlib import Path
+from typing import Any
 
 from unstructured_ingest.v2.interfaces import ProcessorConfig, Uploader
 from unstructured_ingest.v2.logger import logger, make_default_logger
@@ -48,33 +52,33 @@ class Pipeline:
     partitioner: InitVar[Partitioner]
     partitioner_step: PartitionStep = field(init=False)
 
-    chunker: InitVar[
-    chunker_step: ChunkStep = field(init=False, default=None)
+    chunker: InitVar[Chunker | None] = None
+    chunker_step: ChunkStep | None = field(init=False, default=None)
 
-    embedder: InitVar[
-    embedder_step: EmbedStep = field(init=False, default=None)
+    embedder: InitVar[Embedder | None] = None
+    embedder_step: EmbedStep | None = field(init=False, default=None)
 
-    stager: InitVar[
-    stager_step: UploadStageStep = field(init=False, default=None)
+    stager: InitVar[UploadStager | None] = None
+    stager_step: UploadStageStep | None = field(init=False, default=None)
 
     uploader: InitVar[Uploader] = field(default=LocalUploader())
-    uploader_step: UploadStep = field(init=False, default=None)
+    uploader_step: UploadStep | None = field(init=False, default=None)
 
-    uncompress_step: UncompressStep = field(init=False, default=None)
+    uncompress_step: UncompressStep | None = field(init=False, default=None)
 
-    filterer: InitVar[
-    filter_step: FilterStep = field(init=False, default=None)
+    filterer: InitVar[Filterer | None] = None
+    filter_step: FilterStep | None = field(init=False, default=None)
 
     def __post_init__(
         self,
         indexer: IndexerT,
         downloader: DownloaderT,
         partitioner: Partitioner,
-        chunker: Chunker = None,
-        embedder: Embedder = None,
-        stager: UploadStager = None,
-        uploader: Uploader = None,
-        filterer: Filterer = None,
+        chunker: Chunker | None = None,
+        embedder: Embedder | None = None,
+        stager: UploadStager | None = None,
+        uploader: Uploader | None = None,
+        filterer: Filterer | None = None,
     ):
         make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
         otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint)
@@ -113,7 +117,9 @@ class Pipeline:
         )
 
     def cleanup(self):
-
+        if self.context.delete_cache and Path(self.context.work_dir).exists():
+            logger.info(f"deleting cache directory: {self.context.work_dir}")
+            shutil.rmtree(self.context.work_dir)
 
     def log_statuses(self):
         if status := self.context.status:
@@ -136,7 +142,7 @@ class Pipeline:
         if self.context.status:
             raise PipelineError("Pipeline did not run successfully")
 
-    def clean_results(self, results:
+    def clean_results(self, results: list[Any | list[Any]] | None) -> list[Any] | None:
         if not results:
             return None
         results = [r for r in results if r]
@@ -181,7 +187,7 @@ class Pipeline:
         return filtered_records
 
     def _run(self):
-        logger.info(f"
+        logger.info(f"running local pipeline: {self} with configs: " f"{self.context.json()}")
         if self.context.mp_supported:
             manager = mp.Manager()
             self.context.status = manager.dict()
@@ -226,26 +232,33 @@ class Pipeline:
             logger.info("No files to process after filtering uncompressed content, exiting")
             return
 
-        if not downloaded_data:
+        if not downloaded_data or self.context.download_only:
             return
 
         # Partition content
         elements = self.partitioner_step(downloaded_data)
+        # Download data non longer needed, delete if possible
+        self.downloader_step.delete_cache()
         elements = self.clean_results(results=elements)
         if not elements:
             logger.info("No files to process after partitioning, exiting")
             return
 
         # Run element specific modifiers
-
-
+        last_step = self.partitioner_step
+        for step in [s for s in [self.chunker_step, self.embedder_step, self.stager_step] if s]:
+            elements = step(elements)
             elements = self.clean_results(results=elements)
+            # Delete data from previous step if possible since no longer needed
+            last_step.delete_cache()
+            last_step = step
             if not elements:
-                logger.info(f"
+                logger.info(f"no files to process after {step.__class__.__name__}, exiting")
                 return
 
         # Upload the final result
         self.uploader_step(iterable=elements)
+        last_step.delete_cache()
 
     def __str__(self):
         s = [str(self.indexer_step)]
@@ -274,12 +287,12 @@ class Pipeline:
         downloader_config: DownloaderConfigT,
         source_connection_config: ConnectionConfig,
         partitioner_config: PartitionerConfig,
-        filterer_config: FiltererConfig = None,
-        chunker_config:
-        embedder_config:
-        destination_connection_config:
-        stager_config:
-        uploader_config:
+        filterer_config: FiltererConfig | None = None,
+        chunker_config: ChunkerConfig | None = None,
+        embedder_config: EmbedderConfig | None = None,
+        destination_connection_config: ConnectionConfig | None = None,
+        stager_config: UploadStagerConfigT | None = None,
+        uploader_config: UploaderConfigT | None = None,
     ) -> "Pipeline":
         # Get registry key based on indexer config
         source_entry = {
```
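The `_run` changes above thread a `last_step` reference through the optional chunk/embed/stage steps so that each predecessor's cached output can be deleted as soon as it has been consumed (and only when `iter_delete` is enabled, per the `delete_cache` gate in `interfaces.py`). A schematic, self-contained sketch of that control flow with toy step objects; the `DummyStep` class and `run_element_steps` helper are hypothetical stand-ins, not part of the package:

```python
from typing import Callable, List, Optional


class DummyStep:
    """Toy stand-in for a pipeline step: callable, with a delete_cache() hook."""

    def __init__(self, name: str, fn: Callable[[list], list]):
        self.name = name
        self.fn = fn

    def __call__(self, elements: list) -> list:
        return self.fn(elements)

    def delete_cache(self) -> None:
        print(f"deleting cache for {self.name}")


def run_element_steps(
    elements: list, partition_step: DummyStep, steps: List[Optional[DummyStep]]
) -> Optional[list]:
    """Run each configured step, then drop the previous step's cache (mirrors the loop above)."""
    last_step = partition_step
    for step in [s for s in steps if s]:
        elements = step(elements)
        last_step.delete_cache()  # the previous step's cached output is no longer needed
        last_step = step
        if not elements:
            return None
    # In the real pipeline the upload runs here; only then is the final step's cache released.
    last_step.delete_cache()
    return elements


if __name__ == "__main__":
    partition = DummyStep("partition", lambda e: e)
    chunk = DummyStep("chunk", lambda e: e)
    embed = DummyStep("embed", lambda e: e)
    print(run_element_steps([{"text": "hi"}], partition, [chunk, None, embed]))
```

Unconfigured steps are filtered out the same way the real loop filters `None` entries, so the cache released at the end always belongs to the last step that actually ran.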
unstructured_ingest/v2/pipeline/steps/chunk.py
CHANGED
```diff
@@ -29,7 +29,7 @@ class ChunkStep(PipelineStep):
 
     def __post_init__(self):
         config = self.process.config.json() if self.process.config else None
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")
 
     def should_chunk(self, filepath: Path, file_data: FileData) -> bool:
         if self.context.reprocess or file_data.reprocess:
@@ -44,7 +44,7 @@ class ChunkStep(PipelineStep):
 
     def _save_output(self, output_filepath: str, chunked_content: list[dict]):
         with open(str(output_filepath), "w") as f:
-            logger.debug(f"
+            logger.debug(f"writing chunker output to: {output_filepath}")
             json.dump(chunked_content, f, indent=2)
 
     async def _run_async(
@@ -54,7 +54,7 @@ class ChunkStep(PipelineStep):
         file_data = FileData.from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=path)
         if not self.should_chunk(filepath=output_filepath, file_data=file_data):
-            logger.debug(f"
+            logger.debug(f"skipping chunking, output already exists: {output_filepath}")
             return ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath))
         fn_kwargs = {"elements_filepath": path}
         if not asyncio.iscoroutinefunction(fn):
```
unstructured_ingest/v2/pipeline/steps/download.py
CHANGED
```diff
@@ -1,6 +1,7 @@
 import asyncio
 import hashlib
 import json
+import shutil
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Optional, TypedDict, TypeVar
@@ -82,7 +83,7 @@ class DownloadStep(PipelineStep):
                 f"match size of local file: {file_size_bytes}, updating"
             )
             file_data.metadata.filesize_bytes = file_size_bytes
-        logger.debug(f"
+        logger.debug(f"updating file data with new content: {file_data.to_dict()}")
         with file_data_path.open("w") as file:
             json.dump(file_data.to_dict(), file, indent=2)
 
@@ -90,7 +91,7 @@
         file_data = FileData.from_file(path=file_data_path)
         download_path = self.process.get_download_path(file_data=file_data)
         if not self.should_download(file_data=file_data, file_data_path=file_data_path):
-            logger.debug(f"
+            logger.debug(f"skipping download, file already exists locally: {download_path}")
             self.update_file_data(
                 file_data=file_data,
                 file_data_path=Path(file_data_path),
@@ -185,3 +186,17 @@
         if extras:
             hashable_string += "".join(extras)
         return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
+
+    @property
+    def cache_dir(self) -> Path:
+        return self.process.download_config.download_dir
+
+    def delete_cache(self):
+        if (
+            self.context.iter_delete
+            and not self.context.preserve_downloads
+            and self.cache_dir.exists()
+        ):
+            cache_dir = self.cache_dir
+            logger.info(f"deleting {self.identifier} cache dir {cache_dir}")
+            shutil.rmtree(cache_dir)
```
unstructured_ingest/v2/pipeline/steps/embed.py
CHANGED
```diff
@@ -29,7 +29,7 @@ class EmbedStep(PipelineStep):
 
     def __post_init__(self):
         config = self.process.config.json() if self.process.config else None
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")
 
     def should_embed(self, filepath: Path, file_data: FileData) -> bool:
         if self.context.reprocess or file_data.reprocess:
@@ -44,7 +44,7 @@ class EmbedStep(PipelineStep):
 
     def _save_output(self, output_filepath: str, embedded_content: list[dict]):
         with open(str(output_filepath), "w") as f:
-            logger.debug(f"
+            logger.debug(f"writing embedded output to: {output_filepath}")
             json.dump(embedded_content, f, indent=2)
 
     async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
@@ -52,7 +52,7 @@ class EmbedStep(PipelineStep):
         file_data = FileData.from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=path)
         if not self.should_embed(filepath=output_filepath, file_data=file_data):
-            logger.debug(f"
+            logger.debug(f"skipping embedding, output already exists: {output_filepath}")
             return EmbedStepResponse(file_data_path=file_data_path, path=str(output_filepath))
         fn_kwargs = {"elements_filepath": path}
         if not asyncio.iscoroutinefunction(fn):
```
unstructured_ingest/v2/pipeline/steps/filter.py
CHANGED
```diff
@@ -17,7 +17,7 @@ class FilterStep(PipelineStep):
 
     def __post_init__(self):
         config = self.process.config.json() if self.process.config else None
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")
 
     async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
         file_data = FileData.from_file(path=file_data_path)
```
unstructured_ingest/v2/pipeline/steps/index.py
CHANGED
```diff
@@ -28,14 +28,14 @@ class IndexStep(PipelineStep):
             self.process.connection_config.json() if self.process.connection_config else None
         )
         logger.info(
-            f"
+            f"created {self.identifier} with configs: {config}, "
             f"connection configs: {connection_config}"
         )
 
     @instrument(span_name=STEP_ID)
     def run(self) -> Generator[str, None, None]:
         for file_data in self.process.run():
-            logger.debug(f"
+            logger.debug(f"generated file data: {file_data.to_dict()}")
             try:
                 record_hash = self.get_hash(extras=[file_data.identifier])
                 filename = f"{record_hash}.json"
```
unstructured_ingest/v2/pipeline/steps/partition.py
CHANGED
```diff
@@ -29,7 +29,7 @@ class PartitionStep(PipelineStep):
 
     def __post_init__(self):
         config = self.process.config.json()
-        logger.info(f"
+        logger.info(f"created {self.identifier} with configs: {config}")
 
     def should_partition(self, filepath: Path, file_data: FileData) -> bool:
         if self.context.reprocess or file_data.reprocess:
@@ -44,7 +44,7 @@ class PartitionStep(PipelineStep):
 
     def _save_output(self, output_filepath: str, partitioned_content: list[dict]):
         with open(str(output_filepath), "w") as f:
-            logger.debug(f"
+            logger.debug(f"writing partitioned output to: {output_filepath}")
             json.dump(partitioned_content, f, indent=2)
 
     async def _run_async(
@@ -54,7 +54,7 @@ class PartitionStep(PipelineStep):
         file_data = FileData.from_file(path=file_data_path)
         output_filepath = self.get_output_filepath(filename=Path(file_data_path))
         if not self.should_partition(filepath=output_filepath, file_data=file_data):
-            logger.debug(f"
+            logger.debug(f"skipping partitioning, output already exists: {output_filepath}")
             return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
         fn_kwargs = {"filename": path, "metadata": file_data.metadata.to_dict()}
         if not asyncio.iscoroutinefunction(fn):
```