unstructured-ingest 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (82)
  1. unstructured_ingest/__version__.py +1 -1
  2. unstructured_ingest/cli/interfaces.py +1 -1
  3. unstructured_ingest/cli/utils.py +1 -1
  4. unstructured_ingest/connector/astradb.py +1 -1
  5. unstructured_ingest/connector/biomed.py +4 -4
  6. unstructured_ingest/connector/chroma.py +1 -1
  7. unstructured_ingest/connector/databricks_volumes.py +2 -2
  8. unstructured_ingest/connector/fsspec/box.py +1 -1
  9. unstructured_ingest/connector/fsspec/fsspec.py +5 -5
  10. unstructured_ingest/connector/git.py +1 -1
  11. unstructured_ingest/connector/google_drive.py +4 -4
  12. unstructured_ingest/connector/hubspot.py +1 -1
  13. unstructured_ingest/connector/kafka.py +8 -8
  14. unstructured_ingest/connector/local.py +1 -1
  15. unstructured_ingest/connector/notion/helpers.py +4 -4
  16. unstructured_ingest/connector/onedrive.py +3 -3
  17. unstructured_ingest/connector/outlook.py +2 -2
  18. unstructured_ingest/connector/pinecone.py +1 -1
  19. unstructured_ingest/connector/sharepoint.py +8 -8
  20. unstructured_ingest/connector/vectara.py +6 -6
  21. unstructured_ingest/embed/__init__.py +17 -0
  22. unstructured_ingest/embed/bedrock.py +70 -0
  23. unstructured_ingest/embed/huggingface.py +73 -0
  24. unstructured_ingest/embed/interfaces.py +36 -0
  25. unstructured_ingest/embed/mixedbreadai.py +177 -0
  26. unstructured_ingest/embed/octoai.py +63 -0
  27. unstructured_ingest/embed/openai.py +61 -0
  28. unstructured_ingest/embed/vertexai.py +88 -0
  29. unstructured_ingest/embed/voyageai.py +69 -0
  30. unstructured_ingest/interfaces.py +21 -11
  31. unstructured_ingest/logger.py +1 -1
  32. unstructured_ingest/pipeline/copy.py +1 -1
  33. unstructured_ingest/pipeline/interfaces.py +2 -2
  34. unstructured_ingest/pipeline/partition.py +1 -1
  35. unstructured_ingest/pipeline/pipeline.py +1 -1
  36. unstructured_ingest/pipeline/reformat/chunking.py +2 -2
  37. unstructured_ingest/pipeline/reformat/embedding.py +4 -6
  38. unstructured_ingest/pipeline/source.py +2 -2
  39. unstructured_ingest/utils/compression.py +3 -3
  40. unstructured_ingest/utils/data_prep.py +20 -12
  41. unstructured_ingest/utils/string_and_date_utils.py +2 -2
  42. unstructured_ingest/v2/cli/base/cmd.py +3 -3
  43. unstructured_ingest/v2/cli/base/dest.py +1 -1
  44. unstructured_ingest/v2/cli/base/src.py +3 -2
  45. unstructured_ingest/v2/cli/utils/click.py +1 -1
  46. unstructured_ingest/v2/interfaces/processor.py +48 -13
  47. unstructured_ingest/v2/logger.py +1 -1
  48. unstructured_ingest/v2/otel.py +1 -1
  49. unstructured_ingest/v2/pipeline/interfaces.py +12 -3
  50. unstructured_ingest/v2/pipeline/pipeline.py +42 -29
  51. unstructured_ingest/v2/pipeline/steps/chunk.py +3 -3
  52. unstructured_ingest/v2/pipeline/steps/download.py +17 -2
  53. unstructured_ingest/v2/pipeline/steps/embed.py +3 -3
  54. unstructured_ingest/v2/pipeline/steps/filter.py +1 -1
  55. unstructured_ingest/v2/pipeline/steps/index.py +2 -2
  56. unstructured_ingest/v2/pipeline/steps/partition.py +3 -3
  57. unstructured_ingest/v2/pipeline/steps/stage.py +1 -1
  58. unstructured_ingest/v2/pipeline/steps/uncompress.py +1 -1
  59. unstructured_ingest/v2/processes/connectors/__init__.py +3 -0
  60. unstructured_ingest/v2/processes/connectors/airtable.py +235 -0
  61. unstructured_ingest/v2/processes/connectors/chroma.py +6 -1
  62. unstructured_ingest/v2/processes/connectors/elasticsearch.py +1 -1
  63. unstructured_ingest/v2/processes/connectors/fsspec/box.py +1 -1
  64. unstructured_ingest/v2/processes/connectors/fsspec/fsspec.py +4 -4
  65. unstructured_ingest/v2/processes/connectors/google_drive.py +2 -3
  66. unstructured_ingest/v2/processes/connectors/local.py +6 -5
  67. unstructured_ingest/v2/processes/connectors/milvus.py +1 -1
  68. unstructured_ingest/v2/processes/connectors/onedrive.py +8 -6
  69. unstructured_ingest/v2/processes/connectors/opensearch.py +1 -1
  70. unstructured_ingest/v2/processes/connectors/pinecone.py +38 -16
  71. unstructured_ingest/v2/processes/connectors/sharepoint.py +10 -6
  72. unstructured_ingest/v2/processes/embedder.py +41 -24
  73. unstructured_ingest/v2/processes/filter.py +1 -1
  74. unstructured_ingest/v2/processes/partitioner.py +3 -3
  75. unstructured_ingest/v2/utils.py +7 -0
  76. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/METADATA +212 -211
  77. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/RECORD +81 -72
  78. unstructured_ingest/evaluate.py +0 -338
  79. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/LICENSE.md +0 -0
  80. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/WHEEL +0 -0
  81. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/entry_points.txt +0 -0
  82. {unstructured_ingest-0.0.13.dist-info → unstructured_ingest-0.0.15.dist-info}/top_level.txt +0 -0
@@ -27,8 +27,6 @@ class Embedder(ReformatNode):
  return hashlib.sha256(json.dumps(hash_dict, sort_keys=True).encode()).hexdigest()[:32]

  def run(self, elements_json: str) -> Optional[str]:
- from unstructured.staging.base import elements_from_json
-
  try:
  elements_json_filename = os.path.basename(elements_json)
  filename_ext = os.path.basename(elements_json_filename)
@@ -46,12 +44,12 @@ class Embedder(ReformatNode):
  and json_path.is_file()
  and json_path.stat().st_size
  ):
- logger.debug(f"File exists: {json_path}, skipping embedding")
+ logger.debug(f"file exists: {json_path}, skipping embedding")
  return str(json_path)
- elements = elements_from_json(filename=elements_json)
+ with open(elements_json) as f:
+ elements = json.load(f)
  embedder = self.embedder_config.get_embedder()
- embedded_elements = embedder.embed_documents(elements=elements)
- element_dicts = [e.to_dict() for e in embedded_elements]
+ element_dicts = embedder.embed_documents(elements=elements)
  with open(json_path, "w", encoding="utf8") as output_f:
  logger.info(f"writing embeddings content to {json_path}")
  json.dump(element_dicts, output_f, ensure_ascii=False, indent=2)
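For readers following the embedding change above: the v1 Embedder node now reads the elements JSON as plain dictionaries and expects `embed_documents` to return dictionaries directly, so the `unstructured.staging.base` import is no longer needed at this stage. A minimal, runnable sketch of that flow is below; `FakeEmbedder` is a hypothetical stand-in for whatever `embedder_config.get_embedder()` returns and is not part of the package.

```python
# Minimal sketch of the new element-embedding flow, assuming an embedder
# whose embed_documents() accepts and returns plain dicts (as in the diff).
import json
import tempfile
from pathlib import Path


class FakeEmbedder:
    def embed_documents(self, elements: list) -> list:
        # Attach a dummy embedding to each element dict.
        for element in elements:
            element.setdefault("embeddings", [0.0, 0.0, 0.0])
        return elements


def embed_elements_file(elements_json: str, output_path: str) -> str:
    # Elements are loaded as plain dicts rather than Element objects.
    with open(elements_json) as f:
        elements = json.load(f)
    element_dicts = FakeEmbedder().embed_documents(elements=elements)
    with open(output_path, "w", encoding="utf8") as output_f:
        json.dump(element_dicts, output_f, ensure_ascii=False, indent=2)
    return output_path


if __name__ == "__main__":
    workdir = Path(tempfile.mkdtemp())
    src = workdir / "doc.json"
    src.write_text(json.dumps([{"type": "NarrativeText", "text": "hello"}]))
    print(embed_elements_file(str(src), str(workdir / "doc-embedded.json")))
```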
@@ -24,12 +24,12 @@ class Reader(SourceNode):
  and doc.filename.is_file()
  and doc.filename.stat().st_size
  ):
- logger.info(f"File exists: {doc.filename}, skipping download")
+ logger.info(f"file exists: {doc.filename}, skipping download")
  # Still need to fetch metadata if file exists locally
  doc.update_source_metadata()
  else:
  serialized_doc = doc.to_json(redact_sensitive=True)
- logger.debug(f"Fetching {serialized_doc} - PID: {os.getpid()}")
+ logger.debug(f"fetching {serialized_doc} - PID: {os.getpid()}")
  if self.retry_strategy:
  self.retry_strategy(doc.get_file)
  else:
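The Reader hunk above keeps the existing skip logic: a download is skipped only when a non-empty local copy exists, and source metadata is still refreshed in that case. A small sketch of the guard, with `get_file` and `retry_strategy` as hypothetical stand-ins for the connector's callables:

```python
# Skip-download guard: a zero-byte file from an interrupted run is
# treated as missing, so it gets fetched again.
from pathlib import Path
from typing import Callable, Optional


def fetch_if_missing(
    local_path: Path,
    get_file: Callable[[], object],
    retry_strategy: Optional[Callable[[Callable[[], object]], object]] = None,
) -> None:
    if local_path.is_file() and local_path.stat().st_size:
        print(f"file exists: {local_path}, skipping download")
        return
    if retry_strategy:
        retry_strategy(get_file)
    else:
        get_file()


if __name__ == "__main__":
    target = Path("example.txt")
    fetch_if_missing(target, get_file=lambda: target.write_text("downloaded"))
    fetch_if_missing(target, get_file=lambda: target.write_text("downloaded"))  # skipped
```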
@@ -22,7 +22,7 @@ TAR_FILE_EXT = [".tar", ".tar.gz", ".tgz"]

  def uncompress_file(filename: str, path: Optional[str] = None) -> str:
  """
- Takes in a compressed zip or tar file and uncompresses it
+ Takes in a compressed zip or tar file and decompresses it
  """
  # Create path if it doesn't already exist
  if path:
@@ -65,7 +65,7 @@ def uncompress_tar_file(tar_filename: str, path: Optional[str] = None) -> str:
  logger.info(f"extracting tar {tar_filename} -> {path}")
  # NOTE: "r:*" mode opens both compressed (e.g ".tar.gz") and uncompressed ".tar" archives
  with tarfile.open(tar_filename, "r:*") as tfile:
- # NOTE(robinson: Mitigate against malicious content being extracted from the tar file.
+ # NOTE(robinson): Mitigate against malicious content being extracted from the tar file.
  # This was added in Python 3.12
  # Ref: https://docs.python.org/3/library/tarfile.html#extraction-filters
  if sys.version_info >= (3, 12):
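The comment fixed above refers to the tarfile extraction filters added in Python 3.12. A hedged sketch of the version-guarded call this code path implies (the body of the package's branch is not shown in the hunk):

```python
# Version-guarded tar extraction. On 3.12+ the "data" filter rejects
# absolute paths, "..", and other malicious members; older interpreters
# fall back to the historical extractall() behaviour.
import sys
import tarfile


def safe_untar(tar_filename: str, path: str) -> None:
    # "r:*" opens both compressed (e.g. ".tar.gz") and plain ".tar" archives.
    with tarfile.open(tar_filename, "r:*") as tfile:
        if sys.version_info >= (3, 12):
            # Ref: https://docs.python.org/3/library/tarfile.html#extraction-filters
            tfile.extractall(path=path, filter="data")
        else:
            tfile.extractall(path=path)
```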
@@ -113,6 +113,6 @@ class CompressionSourceConnectorMixin:
  read_config=new_read_configs,
  processor_config=new_process_configs,
  )
- logger.info(f"Created local source connector: {local_connector.to_json()}")
+ logger.info(f"created local source connector: {local_connector.to_json()}")
  local_connector.initialize()
  return local_connector.get_ingest_docs()
@@ -1,12 +1,15 @@
  import itertools
  import json
  from datetime import datetime
- from typing import Any, Optional, Sequence, cast
+ from typing import Any, Iterable, Optional, Sequence, TypeVar, cast

  DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")

+ T = TypeVar("T")
+ IterableT = Iterable[T]

- def batch_generator(iterable, batch_size=100):
+
+ def batch_generator(iterable: IterableT, batch_size: int = 100) -> IterableT:
  """A helper function to break an iterable into batches of size batch_size."""
  it = iter(iterable)
  chunk = tuple(itertools.islice(it, batch_size))
@@ -16,23 +19,28 @@ def batch_generator(iterable, batch_size=100):


  def generator_batching_wbytes(
- iterable, batch_size_limit_bytes=15_000_000, max_batch_size: int = 1000
- ):
+ iterable: IterableT,
+ batch_size_limit_bytes: Optional[int] = None,
+ max_batch_size: Optional[int] = None,
+ ) -> IterableT:
+ if not batch_size_limit_bytes and not max_batch_size:
+ return iterable
  """A helper function to break an iterable into chunks of specified bytes."""
  current_batch, current_batch_size = [], 0

  for item in iterable:
  item_size_bytes = len(json.dumps(item).encode("utf-8"))
-
- if (
- current_batch_size + item_size_bytes <= batch_size_limit_bytes
- or len(current_batch) == 0 # prevent inifite yielding of empty batch
- ) and len(current_batch) < max_batch_size:
- current_batch.append(item)
- current_batch_size += item_size_bytes
- else:
+ if batch_size_limit_bytes and current_batch_size + item_size_bytes > batch_size_limit_bytes:
+ yield current_batch
+ current_batch, current_batch_size = [item], item_size_bytes
+ continue
+ if max_batch_size and len(current_batch) + 1 > max_batch_size:
  yield current_batch
  current_batch, current_batch_size = [item], item_size_bytes
+ continue
+
+ current_batch.append(item)
+ current_batch_size += item_size_bytes

  if current_batch:
  yield current_batch
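The rewrite above makes both limits on `generator_batching_wbytes` optional and short-circuits when neither is set. As an illustration only (not the package's function), an equivalent standalone batching generator might look like this:

```python
# Batch an iterable by serialized size and/or item count; both limits are
# optional, and an empty batch is never yielded.
import json
from typing import Any, Iterable, Iterator, Optional


def batch_by_bytes(
    iterable: Iterable[Any],
    batch_size_limit_bytes: Optional[int] = None,
    max_batch_size: Optional[int] = None,
) -> Iterator[list]:
    batch: list = []
    batch_bytes = 0
    for item in iterable:
        item_bytes = len(json.dumps(item).encode("utf-8"))
        over_bytes = batch_size_limit_bytes and batch_bytes + item_bytes > batch_size_limit_bytes
        over_count = max_batch_size and len(batch) + 1 > max_batch_size
        # Start a new batch when adding the item would break either limit.
        if batch and (over_bytes or over_count):
            yield batch
            batch, batch_bytes = [], 0
        batch.append(item)
        batch_bytes += item_bytes
    if batch:
        yield batch


if __name__ == "__main__":
    records = [{"id": i, "text": "x" * i} for i in range(10)]
    for chunk in batch_by_bytes(records, batch_size_limit_bytes=60, max_batch_size=4):
        print(len(chunk), sum(len(json.dumps(r)) for r in chunk))
```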
@@ -10,13 +10,13 @@ def json_to_dict(json_string: str) -> t.Union[str, t.Dict[str, t.Any]]:
  try:
  return json.loads(json_string)
  except json.JSONDecodeError:
- # Not neccessary an error if it is a path or malformed json
+ # Not necessary an error if it is a path or malformed json
  pass
  try:
  # This is common when single quotes are used instead of double quotes
  return json.loads(json_string.replace("'", '"'))
  except json.JSONDecodeError:
- # Not neccessary an error if it is a path
+ # Not necessary an error if it is a path
  pass
  return json_string

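For context, `json_to_dict` deliberately swallows decode errors because its input may be a file path rather than JSON. A self-contained version of the same strategy, runnable as-is:

```python
# Tolerant parsing: try strict JSON, then retry with single quotes swapped
# for double quotes, and finally hand the string back unchanged (it may
# simply be a file path).
import json
from typing import Any, Union


def json_to_dict(json_string: str) -> Union[str, dict]:
    try:
        return json.loads(json_string)
    except json.JSONDecodeError:
        pass  # not necessarily an error; could be a path or single-quoted JSON
    try:
        return json.loads(json_string.replace("'", '"'))
    except json.JSONDecodeError:
        pass
    return json_string


if __name__ == "__main__":
    print(json_to_dict('{"key": "value"}'))    # parsed dict
    print(json_to_dict("{'key': 'value'}"))    # dict after quote swap
    print(json_to_dict("/tmp/settings.json"))  # returned as-is
```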
@@ -102,7 +102,7 @@ class BaseCmd(ABC):
  cmd.params.extend(options)
  return cmd

- def get_pipline(
+ def get_pipeline(
  self,
  src: str,
  source_options: dict[str, Any],
@@ -122,7 +122,7 @@ class BaseCmd(ABC):
  pipeline_kwargs["chunker"] = chunker
  if filterer := self.get_filterer(options=source_options):
  pipeline_kwargs["filterer"] = filterer
- if embedder := self.get_embeder(options=source_options):
+ if embedder := self.get_embedder(options=source_options):
  pipeline_kwargs["embedder"] = embedder
  if dest:
  logger.debug(
@@ -160,7 +160,7 @@ class BaseCmd(ABC):
  return Filterer(config=filterer_configs)

  @staticmethod
- def get_embeder(options: dict[str, Any]) -> Optional[Embedder]:
+ def get_embedder(options: dict[str, Any]) -> Optional[Embedder]:
  embedder_config = extract_config(flat_data=options, config=EmbedderConfig)
  if not embedder_config.embedding_provider:
  return None
@@ -40,7 +40,7 @@ class DestCmd(BaseCmd):
  source_options: dict = ctx.parent.params if ctx.parent else {}
  conform_click_options(options)
  try:
- pipeline = self.get_pipline(
+ pipeline = self.get_pipeline(
  src=source_cmd,
  source_options=source_options,
  dest=self.cmd_name,
@@ -1,5 +1,6 @@
  import logging
  from dataclasses import dataclass, field
+ from typing import Any

  import click
  from pydantic import BaseModel
@@ -47,14 +48,14 @@ class SrcCmd(BaseCmd):
  options = self.consolidate_options(options=options)
  return options

- def cmd(self, ctx: click.Context, **options) -> None:
+ def cmd(self, ctx: click.Context, **options: dict[str, Any]) -> None:
  if ctx.invoked_subcommand:
  return

  conform_click_options(options)
  logger.setLevel(logging.DEBUG if options.get("verbose", False) else logging.INFO)
  try:
- pipeline = self.get_pipline(src=self.cmd_name, source_options=options)
+ pipeline = self.get_pipeline(src=self.cmd_name, source_options=options)
  pipeline.run()
  except Exception as e:
  logger.error(f"failed to run source command {self.cmd_name}: {e}", exc_info=True)
@@ -67,7 +67,7 @@ class FileOrJson(click.ParamType):
  return value
  self.fail(
  gettext(
- "{value} is not a valid json string nor an existing filepath.",
+ "{value} is neither a valid json string nor an existing filepath.",
  ).format(value=value),
  param,
  ctx,
@@ -11,21 +11,56 @@ DEFAULT_WORK_DIR = str((Path.home() / ".cache" / "unstructured" / "ingest" / "pi
  class ProcessorConfig(BaseModel):
  model_config = ConfigDict(arbitrary_types_allowed=True)

- reprocess: bool = False
- verbose: bool = False
- tqdm: bool = False
- work_dir: str = Field(default_factory=lambda: DEFAULT_WORK_DIR)
- num_processes: int = 2
- max_connections: Optional[int] = None
- raise_on_error: bool = False
+ reprocess: bool = Field(
+ default=False,
+ description="Reprocess a downloaded file even if the relevant structured "
+ "output .json file in output directory already exists.",
+ )
+ verbose: bool = Field(default=False)
+ tqdm: bool = Field(default=False, description="Display tqdm progress bar")
+ work_dir: str = Field(
+ default_factory=lambda: DEFAULT_WORK_DIR,
+ description="Where to place working files when processing each step",
+ )
+ num_processes: int = Field(
+ default=2, description="Number of parallel processes with which to process docs"
+ )
+ max_connections: Optional[int] = Field(
+ default=None, description="Limit of concurrent connectionts"
+ )
+ raise_on_error: bool = Field(
+ default=False,
+ description="Is set, will raise error if any doc in the pipeline fail. "
+ "Otherwise will log error and continue with other docs",
+ )
  disable_parallelism: bool = Field(
- default_factory=lambda: os.getenv("INGEST_DISABLE_PARALLELISM", "false").lower() == "true"
+ default_factory=lambda: os.getenv("INGEST_DISABLE_PARALLELISM", "false").lower() == "true",
+ )
+ preserve_downloads: bool = Field(
+ default=False, description="Don't delete downloaded files after process completes"
+ )
+ download_only: bool = Field(
+ default=False, description="skip the rest of the process after files are downloaded"
+ )
+ re_download: bool = Field(
+ default=False,
+ description="If set, will re-download downloaded files "
+ "regardless of if they already exist locally",
+ )
+ uncompress: bool = Field(
+ default=False,
+ description="Uncompress any archived files. Currently supporting "
+ "zip and tar files based on file extension.",
+ )
+ iter_delete: bool = Field(
+ default=False,
+ description="If limited on memory, this can be enabled to delete "
+ "cached content as it's used and no longer needed in the pipeline.",
+ )
+ delete_cache: bool = Field(
+ default=False,
+ description="If set, will delete the cache work directory when process finishes",
  )
- preserve_downloads: bool = False
- download_only: bool = False
- max_docs: Optional[int] = None
- re_download: bool = False
- uncompress: bool = False

  # OTEL support
  otel_endpoint: Optional[str] = Field(
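The ProcessorConfig hunk above converts bare defaults into pydantic `Field(...)` declarations with descriptions and adds the new `iter_delete` and `delete_cache` flags. An illustrative model (not the package's class) showing the same pattern and how the descriptions surface in the generated schema:

```python
# Pydantic v2 pattern: explicit defaults plus human-readable descriptions
# that downstream tooling (CLI help, JSON schema) can surface.
from pathlib import Path

from pydantic import BaseModel, Field


class MiniProcessorConfig(BaseModel):
    reprocess: bool = Field(
        default=False,
        description="Reprocess a downloaded file even if structured output already exists.",
    )
    work_dir: str = Field(
        default_factory=lambda: str(Path.home() / ".cache" / "example"),
        description="Where to place working files when processing each step",
    )
    num_processes: int = Field(default=2, description="Number of parallel worker processes")


if __name__ == "__main__":
    config = MiniProcessorConfig(num_processes=4)
    print(config.model_dump())
    # Field descriptions are exposed through the generated JSON schema.
    print(MiniProcessorConfig.model_json_schema()["properties"]["reprocess"]["description"])
```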
@@ -101,7 +101,7 @@ class SensitiveFormatter(Formatter):


  def remove_root_handlers(logger: Logger) -> None:
- # NOTE(robinson) - in some environments such as Google Colab, there is a root handler
+ # NOTE(robinson): in some environments such as Google Colab, there is a root handler
  # that doesn't not mask secrets, meaning sensitive info such as api keys appear in logs.
  # Removing these when they exist prevents this behavior
  if logger.root.hasHandlers():
@@ -92,7 +92,7 @@ class OtelHandler:
  return None
  from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter

- logger.debug(f"Adding otel exported at {otel_endpoint}")
+ logger.debug(f"adding otel exported at {otel_endpoint}")
  trace_exporter = OTLPSpanExporter()
  processor = SimpleSpanProcessor(trace_exporter)
  provider.add_span_processor(processor)
@@ -1,6 +1,9 @@
+ from __future__ import annotations
+
  import asyncio
  import logging
  import multiprocessing as mp
+ import shutil
  from abc import ABC, abstractmethod
  from concurrent.futures import ThreadPoolExecutor
  from dataclasses import dataclass
@@ -116,10 +119,10 @@ class PipelineStep(ABC):
  iterable = iterable or []
  if iterable:
  logger.info(
- f"Calling {self.__class__.__name__} " f"with {len(iterable)} docs", # type: ignore
+ f"calling {self.__class__.__name__} " f"with {len(iterable)} docs", # type: ignore
  )
  else:
- logger.info(f"Calling {self.__class__.__name__} with no inputs")
+ logger.info(f"calling {self.__class__.__name__} with no inputs")
  if self.context.async_supported and self.process.is_async():
  return self.process_async(iterable=iterable)
  if self.context.mp_supported:
@@ -132,7 +135,7 @@ class PipelineStep(ABC):
  async def _run_async(self, fn: Callable, **kwargs: Any) -> Optional[Any]:
  raise NotImplementedError

- def run(self, _fn: Optional[Callable] = None, **kwargs: Any) -> Optional[Any]:
+ def run(self, _fn: Callable[..., Any] | None = None, **kwargs: Any) -> Optional[Any]:
  kwargs = kwargs.copy()
  otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint, log_out=logger.debug)
  tracer = otel_handler.get_tracer()
@@ -178,6 +181,12 @@ class PipelineStep(ABC):
  def cache_dir(self) -> Path:
  return Path(self.context.work_dir) / self.identifier

+ def delete_cache(self):
+ if self.context.iter_delete and self.cache_dir.exists():
+ cache_dir = self.cache_dir
+ logger.info(f"deleting {self.identifier} cache dir {cache_dir}")
+ shutil.rmtree(cache_dir)
+

  @dataclass
  class BatchPipelineStep(PipelineStep, ABC):
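The new `delete_cache` hook above removes a step's cache directory only when the context's `iter_delete` flag is set. A simplified, runnable sketch of the same idea, with stand-in `Context` and `Step` classes rather than the package's types:

```python
# Opt-in cache deletion: each step owns a cache directory under the shared
# work_dir and removes it only when iter_delete is enabled.
import shutil
from dataclasses import dataclass
from pathlib import Path


@dataclass
class Context:
    work_dir: str
    iter_delete: bool = False


@dataclass
class Step:
    context: Context
    identifier: str

    @property
    def cache_dir(self) -> Path:
        return Path(self.context.work_dir) / self.identifier

    def delete_cache(self) -> None:
        if self.context.iter_delete and self.cache_dir.exists():
            print(f"deleting {self.identifier} cache dir {self.cache_dir}")
            shutil.rmtree(self.cache_dir)


if __name__ == "__main__":
    step = Step(context=Context(work_dir="/tmp/ingest-demo", iter_delete=True), identifier="chunk")
    step.cache_dir.mkdir(parents=True, exist_ok=True)
    step.delete_cache()
```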
@@ -1,7 +1,11 @@
+ from __future__ import annotations
+
  import logging
  import multiprocessing as mp
+ import shutil
  from dataclasses import InitVar, dataclass, field
- from typing import Any, Optional, Union
+ from pathlib import Path
+ from typing import Any

  from unstructured_ingest.v2.interfaces import ProcessorConfig, Uploader
  from unstructured_ingest.v2.logger import logger, make_default_logger
@@ -48,33 +52,33 @@ class Pipeline:
  partitioner: InitVar[Partitioner]
  partitioner_step: PartitionStep = field(init=False)

- chunker: InitVar[Optional[Chunker]] = None
- chunker_step: ChunkStep = field(init=False, default=None)
+ chunker: InitVar[Chunker | None] = None
+ chunker_step: ChunkStep | None = field(init=False, default=None)

- embedder: InitVar[Optional[Embedder]] = None
- embedder_step: EmbedStep = field(init=False, default=None)
+ embedder: InitVar[Embedder | None] = None
+ embedder_step: EmbedStep | None = field(init=False, default=None)

- stager: InitVar[Optional[UploadStager]] = None
- stager_step: UploadStageStep = field(init=False, default=None)
+ stager: InitVar[UploadStager | None] = None
+ stager_step: UploadStageStep | None = field(init=False, default=None)

  uploader: InitVar[Uploader] = field(default=LocalUploader())
- uploader_step: UploadStep = field(init=False, default=None)
+ uploader_step: UploadStep | None = field(init=False, default=None)

- uncompress_step: UncompressStep = field(init=False, default=None)
+ uncompress_step: UncompressStep | None = field(init=False, default=None)

- filterer: InitVar[Optional[Filterer]] = None
- filter_step: FilterStep = field(init=False, default=None)
+ filterer: InitVar[Filterer | None] = None
+ filter_step: FilterStep | None = field(init=False, default=None)

  def __post_init__(
  self,
  indexer: IndexerT,
  downloader: DownloaderT,
  partitioner: Partitioner,
- chunker: Chunker = None,
- embedder: Embedder = None,
- stager: UploadStager = None,
- uploader: Uploader = None,
- filterer: Filterer = None,
+ chunker: Chunker | None = None,
+ embedder: Embedder | None = None,
+ stager: UploadStager | None = None,
+ uploader: Uploader | None = None,
+ filterer: Filterer | None = None,
  ):
  make_default_logger(level=logging.DEBUG if self.context.verbose else logging.INFO)
  otel_handler = OtelHandler(otel_endpoint=self.context.otel_endpoint)
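The Pipeline fields above follow a dataclass pattern worth spelling out: `InitVar` arguments are accepted by `__init__` but not stored, and `__post_init__` converts them into optional `*_step` fields declared with `field(init=False)`. A compact illustration under assumed, simplified types:

```python
# InitVar + field(init=False): constructor-only inputs are consumed in
# __post_init__ and turned into derived step fields.
from __future__ import annotations

from dataclasses import InitVar, dataclass, field


class Chunker:
    pass


@dataclass
class ChunkStep:
    process: Chunker


@dataclass
class MiniPipeline:
    # Accepted by __init__ but not stored as a field.
    chunker: InitVar[Chunker | None] = None
    # Derived field, populated in __post_init__.
    chunker_step: ChunkStep | None = field(init=False, default=None)

    def __post_init__(self, chunker: Chunker | None = None) -> None:
        self.chunker_step = ChunkStep(process=chunker) if chunker else None


if __name__ == "__main__":
    print(MiniPipeline().chunker_step)                    # None
    print(MiniPipeline(chunker=Chunker()).chunker_step)   # ChunkStep(...)
```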
@@ -113,7 +117,9 @@ class Pipeline:
  )

  def cleanup(self):
- pass
+ if self.context.delete_cache and Path(self.context.work_dir).exists():
+ logger.info(f"deleting cache directory: {self.context.work_dir}")
+ shutil.rmtree(self.context.work_dir)

  def log_statuses(self):
  if status := self.context.status:
@@ -136,7 +142,7 @@ class Pipeline:
  if self.context.status:
  raise PipelineError("Pipeline did not run successfully")

- def clean_results(self, results: Optional[list[Union[Any, list[Any]]]]) -> Optional[list[Any]]:
+ def clean_results(self, results: list[Any | list[Any]] | None) -> list[Any] | None:
  if not results:
  return None
  results = [r for r in results if r]
@@ -181,7 +187,7 @@ class Pipeline:
  return filtered_records

  def _run(self):
- logger.info(f"Running local pipline: {self} with configs: " f"{self.context.json()}")
+ logger.info(f"running local pipeline: {self} with configs: " f"{self.context.json()}")
  if self.context.mp_supported:
  manager = mp.Manager()
  self.context.status = manager.dict()
@@ -226,26 +232,33 @@ class Pipeline:
  logger.info("No files to process after filtering uncompressed content, exiting")
  return

- if not downloaded_data:
+ if not downloaded_data or self.context.download_only:
  return

  # Partition content
  elements = self.partitioner_step(downloaded_data)
+ # Download data non longer needed, delete if possible
+ self.downloader_step.delete_cache()
  elements = self.clean_results(results=elements)
  if not elements:
  logger.info("No files to process after partitioning, exiting")
  return

  # Run element specific modifiers
- for step in [self.chunker_step, self.embedder_step, self.stager_step]:
- elements = step(elements) if step else elements
+ last_step = self.partitioner_step
+ for step in [s for s in [self.chunker_step, self.embedder_step, self.stager_step] if s]:
+ elements = step(elements)
  elements = self.clean_results(results=elements)
+ # Delete data from previous step if possible since no longer needed
+ last_step.delete_cache()
+ last_step = step
  if not elements:
- logger.info(f"No files to process after {step.__class__.__name__}, exiting")
+ logger.info(f"no files to process after {step.__class__.__name__}, exiting")
  return

  # Upload the final result
  self.uploader_step(iterable=elements)
+ last_step.delete_cache()

  def __str__(self):
  s = [str(self.indexer_step)]
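The `_run` change above tracks the previously executed step so its cached output can be dropped once the next step has consumed it, with the final cache removed after the uploader runs. A simplified sketch of that hand-off, using toy steps rather than the real pipeline objects:

```python
# Hand-off pattern: delete the previous step's cache only after the next
# step has consumed its output. DemoStep is a toy stand-in.
from dataclasses import dataclass, field
from typing import Callable, Optional


@dataclass
class DemoStep:
    name: str
    fn: Callable[[list], list]
    deleted: bool = field(default=False)

    def __call__(self, elements: list) -> list:
        return self.fn(elements)

    def delete_cache(self) -> None:
        self.deleted = True
        print(f"deleting {self.name} cache")


def run_modifiers(elements: list, partitioner: DemoStep, steps: list) -> list:
    last_step = partitioner
    for step in [s for s in steps if s]:
        elements = step(elements)
        # The previous step's output has been consumed; its cache can go.
        last_step.delete_cache()
        last_step = step
        if not elements:
            print(f"no files to process after {step.name}, exiting")
            return elements
    # In the real pipeline this final deletion happens after the uploader.
    last_step.delete_cache()
    return elements


if __name__ == "__main__":
    partition = DemoStep("partition", lambda e: e)
    chunk = DemoStep("chunk", lambda e: e + ["chunked"])
    embed = DemoStep("embed", lambda e: e + ["embedded"])
    print(run_modifiers(["el"], partition, [chunk, None, embed]))
```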
@@ -274,12 +287,12 @@ class Pipeline:
  downloader_config: DownloaderConfigT,
  source_connection_config: ConnectionConfig,
  partitioner_config: PartitionerConfig,
- filterer_config: FiltererConfig = None,
- chunker_config: Optional[ChunkerConfig] = None,
- embedder_config: Optional[EmbedderConfig] = None,
- destination_connection_config: Optional[ConnectionConfig] = None,
- stager_config: Optional[UploadStagerConfigT] = None,
- uploader_config: Optional[UploaderConfigT] = None,
+ filterer_config: FiltererConfig | None = None,
+ chunker_config: ChunkerConfig | None = None,
+ embedder_config: EmbedderConfig | None = None,
+ destination_connection_config: ConnectionConfig | None = None,
+ stager_config: UploadStagerConfigT | None = None,
+ uploader_config: UploaderConfigT | None = None,
  ) -> "Pipeline":
  # Get registry key based on indexer config
  source_entry = {
@@ -29,7 +29,7 @@ class ChunkStep(PipelineStep):

  def __post_init__(self):
  config = self.process.config.json() if self.process.config else None
- logger.info(f"Created {self.identifier} with configs: {config}")
+ logger.info(f"created {self.identifier} with configs: {config}")

  def should_chunk(self, filepath: Path, file_data: FileData) -> bool:
  if self.context.reprocess or file_data.reprocess:
@@ -44,7 +44,7 @@ class ChunkStep(PipelineStep):

  def _save_output(self, output_filepath: str, chunked_content: list[dict]):
  with open(str(output_filepath), "w") as f:
- logger.debug(f"Writing chunker output to: {output_filepath}")
+ logger.debug(f"writing chunker output to: {output_filepath}")
  json.dump(chunked_content, f, indent=2)

  async def _run_async(
@@ -54,7 +54,7 @@ class ChunkStep(PipelineStep):
  file_data = FileData.from_file(path=file_data_path)
  output_filepath = self.get_output_filepath(filename=path)
  if not self.should_chunk(filepath=output_filepath, file_data=file_data):
- logger.debug(f"Skipping chunking, output already exists: {output_filepath}")
+ logger.debug(f"skipping chunking, output already exists: {output_filepath}")
  return ChunkStepResponse(file_data_path=file_data_path, path=str(output_filepath))
  fn_kwargs = {"elements_filepath": path}
  if not asyncio.iscoroutinefunction(fn):
@@ -1,6 +1,7 @@
  import asyncio
  import hashlib
  import json
+ import shutil
  from dataclasses import dataclass
  from pathlib import Path
  from typing import Callable, Optional, TypedDict, TypeVar
@@ -82,7 +83,7 @@ class DownloadStep(PipelineStep):
  f"match size of local file: {file_size_bytes}, updating"
  )
  file_data.metadata.filesize_bytes = file_size_bytes
- logger.debug(f"Updating file data with new content: {file_data.to_dict()}")
+ logger.debug(f"updating file data with new content: {file_data.to_dict()}")
  with file_data_path.open("w") as file:
  json.dump(file_data.to_dict(), file, indent=2)

@@ -90,7 +91,7 @@ class DownloadStep(PipelineStep):
  file_data = FileData.from_file(path=file_data_path)
  download_path = self.process.get_download_path(file_data=file_data)
  if not self.should_download(file_data=file_data, file_data_path=file_data_path):
- logger.debug(f"Skipping download, file already exists locally: {download_path}")
+ logger.debug(f"skipping download, file already exists locally: {download_path}")
  self.update_file_data(
  file_data=file_data,
  file_data_path=Path(file_data_path),
@@ -185,3 +186,17 @@ class DownloadStep(PipelineStep):
  if extras:
  hashable_string += "".join(extras)
  return hashlib.sha256(hashable_string.encode()).hexdigest()[:12]
+
+ @property
+ def cache_dir(self) -> Path:
+ return self.process.download_config.download_dir
+
+ def delete_cache(self):
+ if (
+ self.context.iter_delete
+ and not self.context.preserve_downloads
+ and self.cache_dir.exists()
+ ):
+ cache_dir = self.cache_dir
+ logger.info(f"deleting {self.identifier} cache dir {cache_dir}")
+ shutil.rmtree(cache_dir)
@@ -29,7 +29,7 @@ class EmbedStep(PipelineStep):

  def __post_init__(self):
  config = self.process.config.json() if self.process.config else None
- logger.info(f"Created {self.identifier} with configs: {config}")
+ logger.info(f"created {self.identifier} with configs: {config}")

  def should_embed(self, filepath: Path, file_data: FileData) -> bool:
  if self.context.reprocess or file_data.reprocess:
@@ -44,7 +44,7 @@ class EmbedStep(PipelineStep):

  def _save_output(self, output_filepath: str, embedded_content: list[dict]):
  with open(str(output_filepath), "w") as f:
- logger.debug(f"Writing embedded output to: {output_filepath}")
+ logger.debug(f"writing embedded output to: {output_filepath}")
  json.dump(embedded_content, f, indent=2)

  async def _run_async(self, fn: Callable, path: str, file_data_path: str) -> EmbedStepResponse:
@@ -52,7 +52,7 @@ class EmbedStep(PipelineStep):
  file_data = FileData.from_file(path=file_data_path)
  output_filepath = self.get_output_filepath(filename=path)
  if not self.should_embed(filepath=output_filepath, file_data=file_data):
- logger.debug(f"Skipping embedding, output already exists: {output_filepath}")
+ logger.debug(f"skipping embedding, output already exists: {output_filepath}")
  return EmbedStepResponse(file_data_path=file_data_path, path=str(output_filepath))
  fn_kwargs = {"elements_filepath": path}
  if not asyncio.iscoroutinefunction(fn):
@@ -17,7 +17,7 @@ class FilterStep(PipelineStep):

  def __post_init__(self):
  config = self.process.config.json() if self.process.config else None
- logger.info(f"Created {self.identifier} with configs: {config}")
+ logger.info(f"created {self.identifier} with configs: {config}")

  async def _run_async(self, fn: Callable, file_data_path: str, **kwargs) -> Optional[dict]:
  file_data = FileData.from_file(path=file_data_path)
@@ -28,14 +28,14 @@ class IndexStep(PipelineStep):
  self.process.connection_config.json() if self.process.connection_config else None
  )
  logger.info(
- f"Created {self.identifier} with configs: {config}, "
+ f"created {self.identifier} with configs: {config}, "
  f"connection configs: {connection_config}"
  )

  @instrument(span_name=STEP_ID)
  def run(self) -> Generator[str, None, None]:
  for file_data in self.process.run():
- logger.debug(f"Generated file data: {file_data.to_dict()}")
+ logger.debug(f"generated file data: {file_data.to_dict()}")
  try:
  record_hash = self.get_hash(extras=[file_data.identifier])
  filename = f"{record_hash}.json"
@@ -29,7 +29,7 @@ class PartitionStep(PipelineStep):

  def __post_init__(self):
  config = self.process.config.json()
- logger.info(f"Created {self.identifier} with configs: {config}")
+ logger.info(f"created {self.identifier} with configs: {config}")

  def should_partition(self, filepath: Path, file_data: FileData) -> bool:
  if self.context.reprocess or file_data.reprocess:
@@ -44,7 +44,7 @@ class PartitionStep(PipelineStep):

  def _save_output(self, output_filepath: str, partitioned_content: list[dict]):
  with open(str(output_filepath), "w") as f:
- logger.debug(f"Writing partitioned output to: {output_filepath}")
+ logger.debug(f"writing partitioned output to: {output_filepath}")
  json.dump(partitioned_content, f, indent=2)

  async def _run_async(
@@ -54,7 +54,7 @@ class PartitionStep(PipelineStep):
  file_data = FileData.from_file(path=file_data_path)
  output_filepath = self.get_output_filepath(filename=Path(file_data_path))
  if not self.should_partition(filepath=output_filepath, file_data=file_data):
- logger.debug(f"Skipping partitioning, output already exists: {output_filepath}")
+ logger.debug(f"skipping partitioning, output already exists: {output_filepath}")
  return PartitionStepResponse(file_data_path=file_data_path, path=str(output_filepath))
  fn_kwargs = {"filename": path, "metadata": file_data.metadata.to_dict()}
  if not asyncio.iscoroutinefunction(fn):